@bjacob
Created September 20, 2021 18:00
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
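// This trace follows an IREE matmul correctness test through the compiler's
// pass pipeline. @matmul_test builds three identical 10x10 matrices (0.0 on
// the diagonal, 1.0 elsewhere), feeds two of them as operands and the third
// as the accumulator to both @actual (linalg.matmul) and @expected (the same
// contraction written as a linalg.generic), then compares the two results
// with check.expect_eq.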
#map0 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
module {
func private @actual(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> attributes {noinline} {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
func private @expected(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> attributes {noinline} {
%0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors
%1 = mulf %arg3, %arg4 : f32
%2 = addf %1, %arg5 : f32
linalg.yield %2 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
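// For reference, the @expected generic above is the matmul contraction
// written out explicitly; per its indexing maps and iterator_types, d1 is
// the reduction dimension, i.e. (a sketch of the semantics, not IR):
//
//   for d0:                     // parallel
//     for d1:                   // reduction
//       for d2:                 // parallel
//         C[d0, d2] += A[d0, d1] * B[d1, d2]
//
// where A = %arg0, B = %arg1, and C = %arg2 (the accumulator).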
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%c10_0 = constant 10 : index
%0 = linalg.init_tensor [%c10, %c10_0] : tensor<?x?xf32>
%1 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<?x?xf32>) outs(%0 : tensor<?x?xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%cst = constant 0.000000e+00 : f32
%cst_5 = constant 1.000000e+00 : f32
%11 = select %10, %cst, %cst_5 : f32
linalg.yield %11 : f32
} -> tensor<?x?xf32>
%c10_1 = constant 10 : index
%c10_2 = constant 10 : index
%2 = linalg.init_tensor [%c10_1, %c10_2] : tensor<?x?xf32>
%3 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<?x?xf32>) outs(%2 : tensor<?x?xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%cst = constant 0.000000e+00 : f32
%cst_5 = constant 1.000000e+00 : f32
%11 = select %10, %cst, %cst_5 : f32
linalg.yield %11 : f32
} -> tensor<?x?xf32>
%c10_3 = constant 10 : index
%c10_4 = constant 10 : index
%4 = linalg.init_tensor [%c10_3, %c10_4] : tensor<?x?xf32>
%5 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%cst = constant 0.000000e+00 : f32
%cst_5 = constant 1.000000e+00 : f32
%11 = select %10, %cst, %cst_5 : f32
linalg.yield %11 : f32
} -> tensor<?x?xf32>
%6 = call @actual(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%7 = call @expected(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
check.expect_eq(%6, %7) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
func private @actual(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> attributes {noinline} {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func private @expected(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> attributes {noinline} {
%0 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors
%1 = mulf %arg3, %arg4 : f32
%2 = addf %1, %arg5 : f32
linalg.yield %2 : f32
} -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
// -----// IR Dump After Canonicalizer //----- //
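// Note: canonicalization has folded the constant extents into static shapes
// (linalg.init_tensor [10, 10] : tensor<10x10xf32>) and inserted tensor.cast
// ops back to tensor<?x?xf32>, the type still required by the @actual and
// @expected call signatures.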
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = linalg.index 1 : index
%13 = cmpi eq, %11, %12 : index
%14 = select %13, %cst_0, %cst : f32
linalg.yield %14 : f32
} -> tensor<10x10xf32>
%2 = tensor.cast %1 : tensor<10x10xf32> to tensor<?x?xf32>
%3 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%3 : tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = linalg.index 1 : index
%13 = cmpi eq, %11, %12 : index
%14 = select %13, %cst_0, %cst : f32
linalg.yield %14 : f32
} -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<10x10xf32>) outs(%6 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = linalg.index 1 : index
%13 = cmpi eq, %11, %12 : index
%14 = select %13, %cst_0, %cst : f32
linalg.yield %14 : f32
} -> tensor<10x10xf32>
%8 = tensor.cast %7 : tensor<10x10xf32> to tensor<?x?xf32>
%9 = call @actual(%2, %5, %8) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = call @expected(%2, %5, %8) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
check.expect_eq(%9, %10) : tensor<?x?xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
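// By this dump the calls to @actual and @expected have been inlined (these
// per-function Canonicalizer dumps are most likely the cleanups the Inliner
// pass runs internally), so the linalg.matmul and the reduction generic now
// appear directly in @matmul_test, operating on static 10x10 tensors.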
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst, %cst_0 : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%2 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<10x10xf32>) outs(%2 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst, %cst_0 : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%4 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<10x10xf32>) outs(%4 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst, %cst_0 : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%6 = linalg.matmul ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%10 = mulf %arg0, %arg1 : f32
%11 = addf %10, %arg2 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%9 = tensor.cast %8 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%7, %9) : tensor<?x?xf32>
return
}
// -----// IR Dump After Inliner //----- //
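// The inliner has also erased the now-unreferenced private callees; the
// module below contains only @matmul_test.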
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
module {
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst, %cst_0 : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%2 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<10x10xf32>) outs(%2 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst, %cst_0 : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%4 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%5 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<10x10xf32>) outs(%4 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst, %cst_0 : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%6 = linalg.matmul ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
%8 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%10 = mulf %arg0, %arg1 : f32
%11 = addf %10, %arg2 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%9 = tensor.cast %8 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%7, %9) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst_0, %cst : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%2 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<10x10xf32>) outs(%2 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst_0, %cst : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%4 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<10x10xf32>) outs(%4 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = linalg.index 1 : index
%12 = cmpi eq, %10, %11 : index
%13 = select %12, %cst_0, %cst : f32
linalg.yield %13 : f32
} -> tensor<10x10xf32>
%6 = linalg.matmul ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %3 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%5 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%10 = mulf %arg0, %arg1 : f32
%11 = addf %10, %arg2 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%9 = tensor.cast %8 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%7, %9) : tensor<?x?xf32>
return
}
// -----// IR Dump After CSE //----- //
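// CSE deduplicates the three identical linalg.init_tensor ops into a single
// %0, which all three fill generics now read from and write into; the
// region-bearing generics themselves are left alone, so %1, %2, and %3
// remain distinct values.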
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After SymbolDCE //----- //
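// SymbolDCE finds nothing to remove here: the inliner already erased the
// dead private functions, so the module is unchanged.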
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
module {
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After IREEImportPublic //----- //
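// IREEImportPublic makes no visible change: there appear to be no public
// IREE input ops to convert in this test.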
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
module {
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After VerifyInputLegality //----- //
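// VerifyInputLegality is verification-only; the IR passes the check and is
// left untouched.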
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::{anonymous}::SimplifyGlobalAccessesPass //----- //
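// There are no global loads or stores in this test, so
// SimplifyGlobalAccesses leaves the function unchanged.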
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After CSE //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After ExpandGlobalDynamicDims //----- //
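// No globals with dynamic dimensions exist here, so ExpandGlobalDynamicDims
// is a no-op, as is the ExpandFunctionDynamicDims pass that follows (the
// entry function takes no dynamically shaped arguments).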
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
module {
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::Shape::{anonymous}::ExpandFunctionDynamicDimsPass //----- //
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
module {
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
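// This dump and the next three (PadTensorToSubTensorInsert,
// ConvertElementwiseToLinalg, LinalgFoldUnitExtentDims) are unchanged: the
// test contains no 1x1 convolutions, pad ops, standalone elementwise ops, or
// unit-extent dimensions for those passes to rewrite.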
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
module {
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After InterchangeGenericOps //----- //
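// InterchangeGenericOps permutes the loop order of the reduction generic so
// the reduction iterator is innermost: the indexing maps become
// (d0, d2) / (d2, d1) / (d0, d1) with iterator_types
// ["parallel", "parallel", "reduction"]. The computed contraction is the
// same; only the dimension names are permuted.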
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After FusionOfTensorOps //----- //
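// FusionOfTensorOps finds no producer-consumer pairs it can merge here (the
// fill generics feed a matmul and a reduction generic, neither of which this
// elementwise-fusion pass fuses into), so the IR is unchanged.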
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After CSE //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
#map0 = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = tensor.cast %4 : tensor<10x10xf32> to tensor<?x?xf32>
%6 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = tensor.cast %6 : tensor<10x10xf32> to tensor<?x?xf32>
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After ConvertToFlowBeforeDispatchFormation //----- //
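// ConvertToFlowBeforeDispatchFormation rewrites the static-to-dynamic
// tensor.cast ops as flow.tensor.reshape ops with explicit result dims
// (%c10, %c10).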
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = flow.tensor.reshape %6 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
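// NOTE: canonicalization here only reorders and renumbers the constants
// (%c10, %cst = 0.0, %cst_0 = 1.0, with the select operands renamed to
// match); the three input generators, the matmul, and the reference generic
// are structurally unchanged.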
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x10xf32>) outs(%0 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%8 = linalg.index 0 : index
%9 = linalg.index 1 : index
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
linalg.yield %11 : f32
} -> tensor<10x10xf32>
%4 = linalg.matmul ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<10x10xf32>, tensor<10x10xf32>) outs(%3 : tensor<10x10xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
%8 = mulf %arg0, %arg1 : f32
%9 = addf %8, %arg2 : f32
linalg.yield %9 : f32
} -> tensor<10x10xf32>
%7 = flow.tensor.reshape %6 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%5, %7) : tensor<?x?xf32>
return
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
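// NOTE: each root linalg op is tiled and wrapped in a flow.dispatch.workgroups
// region with an scf.for tile-loop nest driven by workgroup id/count/size.
// Dispatches %0 and %1 generate the two input matrices (0.0 on the diagonal,
// 1.0 elsewhere, per the select in their bodies); the matrix used as the
// accumulator init is re-materialized inside the matmul dispatches (%2 uses
// linalg.matmul for the @actual path, %4 the explicit 3-loop generic for the
// @expected path) rather than being passed in.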
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%c1 = constant 1 : index
%0 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %7 to %c10_1 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %9 to %c10_1 step %10 {
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg1)
%20 = linalg.index 1 : index
%21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg2)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
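  // NOTE: dispatch region %1 below is identical to %0; both materialize the
  // same 10x10 tensor (0.0 on the diagonal, 1.0 off-diagonal). IREE can later
  // deduplicate such identical executables after outlining.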
%1 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %7 to %c10_1 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %9 to %c10_1 step %10 {
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg1)
%20 = linalg.index 1 : index
%21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg2)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%2 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %7 to %c10_1 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %9 to %c10_1 step %10 {
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%25 = linalg.index 0 : index
%26 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg3)
%27 = linalg.index 1 : index
%28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%27, %arg4)
%29 = cmpi eq, %26, %28 : index
%30 = select %29, %cst_0, %cst : f32
linalg.yield %30 : f32
} -> tensor<?x?xf32>
%24 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
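  // NOTE: dispatch %4 below mirrors %2 but keeps the reference computation as
  // an explicit 3-loop linalg.generic (mulf + addf under a reduction iterator)
  // instead of linalg.matmul, matching the original @actual / @expected pair.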
%4 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %7 to %c10_1 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %9 to %c10_1 step %10 {
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%25 = linalg.index 0 : index
%26 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg3)
%27 = linalg.index 1 : index
%28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%27, %arg4)
%29 = cmpi eq, %26, %28 : index
%30 = select %29, %cst_0, %cst : f32
linalg.yield %30 : f32
} -> tensor<?x?xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%25 = mulf %arg5, %arg6 : f32
%26 = addf %25, %arg7 : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%3, %5) : tensor<?x?xf32>
return
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
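// NOTE: no shaped-type result dims appear to have needed resolving in this
// input, so the IR below is structurally the same as the previous dump; it is
// printed at module scope, which hoists the affine maps into the
// #map0..#map7 aliases.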
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%c1 = constant 1 : index
%0 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %7 to %c10_1 step %8 {
%9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %9 to %c10_1 step %10 {
%11 = affine.min #map1(%workgroup_size_1, %arg1)
%12 = affine.min #map1(%workgroup_size_0, %arg2)
%13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%14 = affine.min #map1(%workgroup_size_1, %arg1)
%15 = affine.min #map1(%workgroup_size_0, %arg2)
%16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg1)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg2)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%1 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %7 to %c10_1 step %8 {
%9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %9 to %c10_1 step %10 {
%11 = affine.min #map1(%workgroup_size_1, %arg1)
%12 = affine.min #map1(%workgroup_size_0, %arg2)
%13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%14 = affine.min #map1(%workgroup_size_1, %arg1)
%15 = affine.min #map1(%workgroup_size_0, %arg2)
%16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg1)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg2)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%2 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %7 to %c10_1 step %8 {
%9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %9 to %c10_1 step %10 {
%11 = affine.min #map1(%workgroup_size_1, %arg3)
%12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%13 = affine.min #map1(%workgroup_size_0, %arg4)
%14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%15 = affine.min #map1(%workgroup_size_1, %arg3)
%16 = affine.min #map1(%workgroup_size_0, %arg4)
%17 = affine.min #map4(%arg3, %workgroup_size_1)
%18 = affine.min #map4(%arg4, %workgroup_size_0)
%19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%20 = affine.min #map4(%arg3, %workgroup_size_1)
%21 = affine.min #map4(%arg4, %workgroup_size_0)
%22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%23 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%25 = linalg.index 0 : index
%26 = affine.apply #map3(%25, %arg3)
%27 = linalg.index 1 : index
%28 = affine.apply #map3(%27, %arg4)
%29 = cmpi eq, %26, %28 : index
%30 = select %29, %cst_0, %cst : f32
linalg.yield %30 : f32
} -> tensor<?x?xf32>
%24 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%4 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %7 to %c10_1 step %8 {
%9 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %9 to %c10_1 step %10 {
%11 = affine.min #map1(%workgroup_size_1, %arg3)
%12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%13 = affine.min #map1(%workgroup_size_0, %arg4)
%14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%15 = affine.min #map1(%workgroup_size_1, %arg3)
%16 = affine.min #map1(%workgroup_size_0, %arg4)
%17 = affine.min #map4(%arg3, %workgroup_size_1)
%18 = affine.min #map4(%arg4, %workgroup_size_0)
%19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%20 = affine.min #map4(%arg3, %workgroup_size_1)
%21 = affine.min #map4(%arg4, %workgroup_size_0)
%22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%23 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%25 = linalg.index 0 : index
%26 = affine.apply #map3(%25, %arg3)
%27 = linalg.index 1 : index
%28 = affine.apply #map3(%27, %arg4)
%29 = cmpi eq, %26, %28 : index
%30 = select %29, %cst_0, %cst : f32
linalg.yield %30 : f32
} -> tensor<?x?xf32>
%24 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%25 = mulf %arg5, %arg6 : f32
%26 = addf %25, %arg7 : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%3, %5) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After ConvertToFlowAfterDispatchFormation //----- //
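// NOTE: nothing remained to convert after dispatch formation; this dump
// matches the previous one, re-printed at function scope with the affine
// maps inlined again.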
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%c1 = constant 1 : index
%0 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %7 to %c10_1 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %9 to %c10_1 step %10 {
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg1)
%20 = linalg.index 1 : index
%21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg2)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%1 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %7 to %c10_1 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %9 to %c10_1 step %10 {
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%13 = tensor.extract_slice %6[%arg1, %arg2] [%11, %12] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%16 = tensor.extract_slice %6[%arg1, %arg2] [%14, %15] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg1)
%20 = linalg.index 1 : index
%21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg2)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg0, offsets = [%arg1, %arg2], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%2 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %7 to %c10_1 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %9 to %c10_1 step %10 {
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%25 = linalg.index 0 : index
%26 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg3)
%27 = linalg.index 1 : index
%28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%27, %arg4)
%29 = cmpi eq, %26, %28 : index
%30 = select %29, %cst_0, %cst : f32
linalg.yield %30 : f32
} -> tensor<?x?xf32>
%24 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%4 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10_1 = constant 10 : index
%6 = linalg.init_tensor [10, 10] : tensor<10x10xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %7 to %c10_1 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %9 to %c10_1 step %10 {
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%12 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%11, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%14 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%19 = tensor.extract_slice %6[%arg3, %arg4] [%17, %18] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%22 = tensor.extract_slice %6[%arg3, %arg4] [%20, %21] [1, 1] : tensor<10x10xf32> to tensor<?x?xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<?x?xf32>) outs(%22 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%25 = linalg.index 0 : index
%26 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%25, %arg3)
%27 = linalg.index 1 : index
%28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%27, %arg4)
%29 = cmpi eq, %26, %28 : index
%30 = select %29, %cst_0, %cst : f32
linalg.yield %30 : f32
} -> tensor<?x?xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%12, %14 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%25 = mulf %arg5, %arg6 : f32
%26 = addf %25, %arg7 : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %24, %arg2, offsets = [%arg3, %arg4], sizes = [%15, %16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%3, %5) : tensor<?x?xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
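// NOTE: canonicalization folds each tensor.extract_slice of the full 10x10
// linalg.init_tensor into a tile-sized linalg.init_tensor [%10, %11] and
// drops the unused full-size init. The tile generic now takes an
// uninitialized init_tensor as its input purely for its shape: its block
// argument %arg3 is never read in the body.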
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10_0 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_1 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %6 to %c10_0 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %8 to %c10_0 step %9 {
%10 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%17 = linalg.index 0 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = linalg.index 1 : index
%20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%19, %arg2)
%21 = cmpi eq, %18, %20 : index
%22 = select %21, %cst, %cst_1 : f32
linalg.yield %22 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %16, %arg0, offsets = [%arg1, %arg2], sizes = [%13, %14], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%1 = flow.dispatch.workgroups[%c10, %c10, %c1]() : () -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10_0 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_1 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %6 to %c10_0 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %8 to %c10_0 step %9 {
%10 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg1)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg2)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%17 = linalg.index 0 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = linalg.index 1 : index
%20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%19, %arg2)
%21 = cmpi eq, %18, %20 : index
%22 = select %21, %cst, %cst_1 : f32
linalg.yield %22 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %16, %arg0, offsets = [%arg1, %arg2], sizes = [%13, %14], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%2 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10_0 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_1 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %6 to %c10_0 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %8 to %c10_0 step %9 {
%10 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%11 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%13 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %12], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%21 = linalg.init_tensor [%19, %20] : tensor<?x?xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%18 : tensor<?x?xf32>) outs(%21 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%24 = linalg.index 0 : index
%25 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%24, %arg3)
%26 = linalg.index 1 : index
%27 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%26, %arg4)
%28 = cmpi eq, %25, %27 : index
%29 = select %28, %cst, %cst_1 : f32
linalg.yield %29 : f32
} -> tensor<?x?xf32>
%23 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%11, %13 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%22 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %23, %arg2, offsets = [%arg3, %arg4], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%4 = flow.dispatch.workgroups[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> =
(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10_0 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_1 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %6 to %c10_0 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %8 to %c10_0 step %9 {
%10 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%11 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%12 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%13 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %12], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_1, %arg3)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 10)>(%workgroup_size_0, %arg4)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg3, %workgroup_size_1)
%20 = affine.min affine_map<(d0, d1) -> (-d0 + 10, d1)>(%arg4, %workgroup_size_0)
%21 = linalg.init_tensor [%19, %20] : tensor<?x?xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%18 : tensor<?x?xf32>) outs(%21 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%24 = linalg.index 0 : index
%25 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%24, %arg3)
%26 = linalg.index 1 : index
%27 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%26, %arg4)
%28 = cmpi eq, %25, %27 : index
%29 = select %28, %cst, %cst_1 : f32
linalg.yield %29 : f32
} -> tensor<?x?xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %13 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%22 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%24 = mulf %arg5, %arg6 : f32
%25 = addf %24, %arg7 : f32
linalg.yield %25 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %23, %arg2, offsets = [%arg3, %arg4], sizes = [%14, %15], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
flow.return
}
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%3, %5) : tensor<?x?xf32>
return
}
// -----// IR Dump After OutlineDispatchRegions //----- //
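// NOTE: the four flow.dispatch.workgroups regions are outlined into private
// flow.executable ops @matmul_test_dispatch_0..3, each exposing a public
// flow.dispatch.entry with workgroup_rank = 3; the entry function now invokes
// them via flow.dispatch @executable::@entry[%c10, %c10, %c1](...).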
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg1)
%5 = affine.min #map1(%workgroup_size_0, %arg2)
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%workgroup_size_1, %arg1)
%8 = affine.min #map1(%workgroup_size_0, %arg2)
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst, %cst_0 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_1 {
flow.dispatch.entry public @matmul_test_dispatch_1 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_1(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg1)
%5 = affine.min #map1(%workgroup_size_0, %arg2)
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%workgroup_size_1, %arg1)
%8 = affine.min #map1(%workgroup_size_0, %arg2)
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst, %cst_0 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg3)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%workgroup_size_0, %arg4)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%workgroup_size_1, %arg3)
%9 = affine.min #map1(%workgroup_size_0, %arg4)
%10 = affine.min #map4(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3, %workgroup_size_1)
%14 = affine.min #map4(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg3)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%workgroup_size_0, %arg4)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%workgroup_size_1, %arg3)
%9 = affine.min #map1(%workgroup_size_0, %arg4)
%10 = affine.min #map4(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3, %workgroup_size_1)
%14 = affine.min #map4(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_1::@matmul_test_dispatch_1[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%3, %5) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
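// Canonicalization runs per function, so only @matmul_test is re-printed below;
// the sole visible change is the order of the two index constants.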
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%c1 = constant 1 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_1::@matmul_test_dispatch_1[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%3, %5) : tensor<?x?xf32>
return
}
// -----// IR Dump After DeduplicateExecutables //----- //
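// @matmul_test_dispatch_1 had a body identical to @matmul_test_dispatch_0 (both
// materialize the 10x10 tensor with 0.0 on the diagonal and 1.0 elsewhere), so it is
// deduplicated: both input-producing dispatches in @matmul_test now target
// @matmul_test_dispatch_0.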
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg1)
%5 = affine.min #map1(%workgroup_size_0, %arg2)
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%workgroup_size_1, %arg1)
%8 = affine.min #map1(%workgroup_size_0, %arg2)
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst, %cst_0 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg3)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%workgroup_size_0, %arg4)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%workgroup_size_1, %arg3)
%9 = affine.min #map1(%workgroup_size_0, %arg4)
%10 = affine.min #map4(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3, %workgroup_size_1)
%14 = affine.min #map4(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg3)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%workgroup_size_0, %arg4)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%workgroup_size_1, %arg3)
%9 = affine.min #map1(%workgroup_size_0, %arg4)
%10 = affine.min #map4(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3, %workgroup_size_1)
%14 = affine.min #map4(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%c1 = constant 1 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%3, %5) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%3, %5) : tensor<?x?xf32>
return
}
// -----// IR Dump After CSE //----- //
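// With both inputs coming from the same executable, CSE merges the two identical
// dispatches into a single %0 that feeds each matmul dispatch as both operands.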
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%2, %4) : tensor<?x?xf32>
return
}
// -----// IR Dump After HoistUnstreamableOps //----- //
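// Nothing here appears to need hoisting ahead of stream formation; only the
// constant order changes.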
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%c1 = constant 1 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%2, %4) : tensor<?x?xf32>
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%2, %4) : tensor<?x?xf32>
return
}
// -----// IR Dump After CSE //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%2, %4) : tensor<?x?xf32>
return
}
// -----// IR Dump After InsertConstantClones //----- //
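// No constants require cloning, so the function is unchanged from the previous dump.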
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10, %c10, %c1]() : () -> tensor<10x10xf32>
%1 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%2 = flow.tensor.reshape %1 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
%3 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10, %c10, %c1](%0, %0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%4 = flow.tensor.reshape %3 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10, %c10}
check.expect_eq(%2, %4) : tensor<?x?xf32>
return
}
// -----// IR Dump After FormStreams //----- //
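// FormStreams wraps the three dispatches and two reshapes into a single
// flow.ex.stream.fragment; the shapex ops materialize the {%c10, %c10} dimensions
// of the two dynamically shaped results, and check.expect_eq stays outside the stream.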
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = shapex.make_ranked_shape %c10, %c10 : (index, index) -> !shapex.ranked_shape<[?,?]>
%1 = shapex.ranked_dim %0[0] : !shapex.ranked_shape<[?,?]> -> index
%2 = shapex.ranked_dim %0[1] : !shapex.ranked_shape<[?,?]> -> index
%3 = shapex.make_ranked_shape %c10, %c10 : (index, index) -> !shapex.ranked_shape<[?,?]>
%4 = shapex.ranked_dim %3[0] : !shapex.ranked_shape<[?,?]> -> index
%5 = shapex.ranked_dim %3[1] : !shapex.ranked_shape<[?,?]> -> index
%6:2 = flow.ex.stream.fragment(%c10, %c1) : (index, index) -> (tensor<?x?xf32>{%1, %2}, tensor<?x?xf32>{%4, %5}) =
(%arg0: index, %arg1: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%7 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%arg0, %arg0, %arg1]() : () -> tensor<10x10xf32>
%8 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%arg0, %arg0, %arg1](%7, %7) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%9 = flow.tensor.reshape %8 : tensor<10x10xf32> -> tensor<?x?xf32>{%arg0, %arg0}
%10 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%arg0, %arg0, %arg1](%7, %7) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%11 = flow.tensor.reshape %10 : tensor<10x10xf32> -> tensor<?x?xf32>{%arg0, %arg0}
flow.return %9, %11 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%6#0, %6#1) : tensor<?x?xf32>
return
}
// -----// IR Dump After OutlineLargeConstants //----- //
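// No tensor constants are large enough to outline; the full module is re-printed
// with the stream fragment in place.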
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg1)
%5 = affine.min #map1(%workgroup_size_0, %arg2)
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%workgroup_size_1, %arg1)
%8 = affine.min #map1(%workgroup_size_0, %arg2)
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst, %cst_0 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg3)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%workgroup_size_0, %arg4)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%workgroup_size_1, %arg3)
%9 = affine.min #map1(%workgroup_size_0, %arg4)
%10 = affine.min #map4(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3, %workgroup_size_1)
%14 = affine.min #map4(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg3)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%workgroup_size_0, %arg4)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%workgroup_size_1, %arg3)
%9 = affine.min #map1(%workgroup_size_0, %arg4)
%10 = affine.min #map4(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3, %workgroup_size_1)
%14 = affine.min #map4(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c1 = constant 1 : index
%c10 = constant 10 : index
%0 = shapex.make_ranked_shape %c10, %c10 : (index, index) -> !shapex.ranked_shape<[?,?]>
%1 = shapex.ranked_dim %0[0] : !shapex.ranked_shape<[?,?]> -> index
%2 = shapex.ranked_dim %0[1] : !shapex.ranked_shape<[?,?]> -> index
%3 = shapex.make_ranked_shape %c10, %c10 : (index, index) -> !shapex.ranked_shape<[?,?]>
%4 = shapex.ranked_dim %3[0] : !shapex.ranked_shape<[?,?]> -> index
%5 = shapex.ranked_dim %3[1] : !shapex.ranked_shape<[?,?]> -> index
%6:2 = flow.ex.stream.fragment(%c10, %c1) : (index, index) -> (tensor<?x?xf32>{%1, %2}, tensor<?x?xf32>{%4, %5}) =
(%arg0: index, %arg1: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%7 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%arg0, %arg0, %arg1]() : () -> tensor<10x10xf32>
%8 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%arg0, %arg0, %arg1](%7, %7) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%9 = flow.tensor.reshape %8 : tensor<10x10xf32> -> tensor<?x?xf32>{%arg0, %arg0}
%10 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%arg0, %arg0, %arg1](%7, %7) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%11 = flow.tensor.reshape %10 : tensor<10x10xf32> -> tensor<?x?xf32>{%arg0, %arg0}
flow.return %9, %11 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%6#0, %6#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
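// The shapex.make_ranked_shape / shapex.ranked_dim chains fold away to %c10, and the
// fragment's explicit operands are dropped in favor of constants re-materialized
// inside its body.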
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c10_0 = constant 10 : index
%c1 = constant 1 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
// -----// IR Dump After CSE //----- //
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c10_0 = constant 10 : index
%c1 = constant 1 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
// -----// IR Dump After SymbolDCE //----- //
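// All three executables are still referenced from the stream fragment, so SymbolDCE
// appears to remove nothing in this dump.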
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0, d1) -> (d0, -d1 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0, d1) -> (-d0 + 10, d1)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg1)
%5 = affine.min #map1(%workgroup_size_0, %arg2)
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%workgroup_size_1, %arg1)
%8 = affine.min #map1(%workgroup_size_0, %arg2)
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst, %cst_0 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg3)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%workgroup_size_0, %arg4)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%workgroup_size_1, %arg3)
%9 = affine.min #map1(%workgroup_size_0, %arg4)
%10 = affine.min #map4(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3, %workgroup_size_1)
%14 = affine.min #map4(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%workgroup_size_1, %arg3)
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%workgroup_size_0, %arg4)
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%workgroup_size_1, %arg3)
%9 = affine.min #map1(%workgroup_size_0, %arg4)
%10 = affine.min #map4(%arg3, %workgroup_size_1)
%11 = affine.min #map4(%arg4, %workgroup_size_0)
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3, %workgroup_size_1)
%14 = affine.min #map4(%arg4, %workgroup_size_0)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c10_0 = constant 10 : index
%c1 = constant 1 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
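// The tile-size clamps are canonicalized into single-dimension affine maps with the
// workgroup size as a symbol (#map1, #map4), and the f32 constants are re-ordered so
// the selects now yield %cst_0 (0.0) on the diagonal; the computation is unchanged.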
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg1)[%workgroup_size_1]
%5 = affine.min #map1(%arg2)[%workgroup_size_0]
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%arg1)[%workgroup_size_1]
%8 = affine.min #map1(%arg2)[%workgroup_size_0]
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst_0, %cst : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c1 = constant 1 : index
%c10_0 = constant 10 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- //
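// Entering the HAL phase: the module gains a hal.device.targets attribute selecting
// a CPU device with an llvm "system-elf-x86_64" executable target; the IR body itself
// is unchanged.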
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_cpu]} {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg1)[%workgroup_size_1]
%5 = affine.min #map1(%arg2)[%workgroup_size_0]
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%arg1)[%workgroup_size_1]
%8 = affine.min #map1(%arg2)[%workgroup_size_0]
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst_0, %cst : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c1 = constant 1 : index
%c10_0 = constant 10 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
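// Note: the module header below carries #device_target_cpu (llvm, system-elf-x86_64,
// x86-64 Linux triple); the dispatch bodies are unchanged from the previous dump, as this
// pass only verifies that a target environment is attached to the module.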
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_cpu]} {
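  // dispatch_0 below fills the shared 10x10 operand: the select on linalg.index equality
  // writes 0.0 on the diagonal and 1.0 everywhere else.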
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg1)[%workgroup_size_1]
%5 = affine.min #map1(%arg2)[%workgroup_size_0]
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%arg1)[%workgroup_size_1]
%8 = affine.min #map1(%arg2)[%workgroup_size_0]
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst_0, %cst : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
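  // dispatch_2 above keeps the contraction as a named linalg.matmul, while dispatch_3 below
  // carries the same computation written out as a linalg.generic over (#map5, #map6, #map7)
  // with a "reduction" iterator; the test compares the results of these two lowerings.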
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
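  // Note: the dispatch numbering skips @matmul_test_dispatch_1; both matmul operands come
  // from the single dispatch_0 result %1, so the duplicate initializer dispatch was
  // presumably deduplicated earlier in the pipeline.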
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c1 = constant 1 : index
%c10_0 = constant 10 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::IdentifyConstantPoolsPass //----- //
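// No constant pools are identified here: the dispatches only use scalar f32/index
// immediates, so this dump is identical to the previous one.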
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_cpu]} {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg1)[%workgroup_size_1]
%5 = affine.min #map1(%arg2)[%workgroup_size_0]
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%arg1)[%workgroup_size_1]
%8 = affine.min #map1(%arg2)[%workgroup_size_0]
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst_0, %cst : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c1 = constant 1 : index
%c10_0 = constant 10 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeConstantPoolBuffersPass //----- //
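// With no constant pools identified above, there are no pool buffers to materialize; the IR
// is again unchanged.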
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_cpu]} {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg1)[%workgroup_size_1]
%5 = affine.min #map1(%arg2)[%workgroup_size_0]
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%arg1)[%workgroup_size_1]
%8 = affine.min #map1(%arg2)[%workgroup_size_0]
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst_0, %cst : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst_0, %cst : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c1 = constant 1 : index
%c10_0 = constant 10 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
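// Canonicalization only re-orders the function-local constants (e.g. %cst is now 0.0 and
// %cst_0 is 1.0, with the select operands renamed to match); the loop nests and linalg ops
// are structurally unchanged.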
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_cpu]} {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg1)[%workgroup_size_1]
%5 = affine.min #map1(%arg2)[%workgroup_size_0]
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%arg1)[%workgroup_size_1]
%8 = affine.min #map1(%arg2)[%workgroup_size_0]
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst, %cst_0 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c10_0 = constant 10 : index
%c1 = constant 1 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After SymbolDCE //----- //
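// SymbolDCE removes nothing: every flow.executable is still referenced from @matmul_test
// via flow.dispatch, so this dump matches the previous one.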
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_cpu]} {
flow.executable private @matmul_test_dispatch_0 {
flow.dispatch.entry public @matmul_test_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_0(%arg0: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg1 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg2 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg1)[%workgroup_size_1]
%5 = affine.min #map1(%arg2)[%workgroup_size_0]
%6 = linalg.init_tensor [%4, %5] : tensor<?x?xf32>
%7 = affine.min #map1(%arg1)[%workgroup_size_1]
%8 = affine.min #map1(%arg2)[%workgroup_size_0]
%9 = linalg.init_tensor [%7, %8] : tensor<?x?xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
%11 = linalg.index 0 : index
%12 = affine.apply #map3(%11, %arg1)
%13 = linalg.index 1 : index
%14 = affine.apply #map3(%13, %arg2)
%15 = cmpi eq, %12, %14 : index
%16 = select %15, %cst, %cst_0 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %10, %arg0, offsets = [%arg1, %arg2], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_2 {
flow.dispatch.entry public @matmul_test_dispatch_2 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
flow.executable private @matmul_test_dispatch_3 {
flow.dispatch.entry public @matmul_test_dispatch_3 attributes {workgroup_rank = 3 : index}
builtin.module {
func @matmul_test_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:10x10xf32>, %arg1: !flow.dispatch.tensor<readonly:10x10xf32>, %arg2: !flow.dispatch.tensor<writeonly:10x10xf32>) {
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%0 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%1 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg3 = %0 to %c10 step %1 {
%2 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%3 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg4 = %2 to %c10 step %3 {
%4 = affine.min #map1(%arg3)[%workgroup_size_1]
%5 = flow.dispatch.tensor.load %arg0, offsets = [%arg3, 0], sizes = [%4, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%6 = affine.min #map1(%arg4)[%workgroup_size_0]
%7 = flow.dispatch.tensor.load %arg1, offsets = [0, %arg4], sizes = [10, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%8 = affine.min #map1(%arg3)[%workgroup_size_1]
%9 = affine.min #map1(%arg4)[%workgroup_size_0]
%10 = affine.min #map4(%arg3)[%workgroup_size_1]
%11 = affine.min #map4(%arg4)[%workgroup_size_0]
%12 = linalg.init_tensor [%10, %11] : tensor<?x?xf32>
%13 = affine.min #map4(%arg3)[%workgroup_size_1]
%14 = affine.min #map4(%arg4)[%workgroup_size_0]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<?x?xf32>) outs(%15 : tensor<?x?xf32>) {
^bb0(%arg5: f32, %arg6: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply #map3(%18, %arg3)
%20 = linalg.index 1 : index
%21 = affine.apply #map3(%20, %arg4)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
} -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %7 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%16 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%18 = mulf %arg5, %arg6 : f32
%19 = addf %18, %arg7 : f32
linalg.yield %19 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %17, %arg2, offsets = [%arg3, %arg4], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c10_0 = constant 10 : index
%c1 = constant 1 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass //----- //
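// First structural change in several passes: each flow.executable becomes a hal.executable
// with an @io interface (StorageBuffer bindings with Read or Write|Discard access), wrapped
// in a hal.executable.variant for the system-elf-x86_64 target. Tensor arguments are replaced
// by hal.interface.binding.subspan ops, and the flow.dispatch.workgroup.* queries become
// hal.interface.workgroup.* (now named x/y instead of 0/1).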
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @matmul_test_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ {
hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%2 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%4 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min #map1(%arg0)[%workgroup_size_y]
%6 = affine.min #map1(%arg1)[%workgroup_size_x]
%7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32>
%8 = affine.min #map1(%arg0)[%workgroup_size_y]
%9 = affine.min #map1(%arg1)[%workgroup_size_x]
%10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32>
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%12 = linalg.index 0 : index
%13 = affine.apply #map3(%12, %arg0)
%14 = linalg.index 1 : index
%15 = affine.apply #map3(%14, %arg1)
%16 = cmpi eq, %13, %15 : index
%17 = select %16, %cst, %cst_0 : f32
linalg.yield %17 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
}
}
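  // Note: each executable declares @io twice, once publicly on the hal.executable and once as
  // a private clone inside the inner module; this appears to let references such as
  // @io::@s0b0_xw_external resolve locally when the variant is later compiled in isolation.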
hal.executable private @matmul_test_dispatch_2 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ {
hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_2() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min #map1(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min #map1(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min #map1(%arg0)[%workgroup_size_y]
%12 = affine.min #map1(%arg1)[%workgroup_size_x]
%13 = affine.min #map4(%arg0)[%workgroup_size_y]
%14 = affine.min #map4(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min #map4(%arg0)[%workgroup_size_y]
%17 = affine.min #map4(%arg1)[%workgroup_size_x]
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply #map3(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply #map3(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst, %cst_0 : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
hal.executable private @matmul_test_dispatch_3 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ {
hal.executable.entry_point public @matmul_test_dispatch_3 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_3() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min #map1(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min #map1(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min #map1(%arg0)[%workgroup_size_y]
%12 = affine.min #map1(%arg1)[%workgroup_size_x]
%13 = affine.min #map4(%arg0)[%workgroup_size_y]
%14 = affine.min #map4(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min #map4(%arg0)[%workgroup_size_y]
%17 = affine.min #map4(%arg1)[%workgroup_size_x]
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply #map3(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply #map3(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst, %cst_0 : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%21 = mulf %arg2, %arg3 : f32
%22 = addf %21, %arg4 : f32
linalg.yield %22 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c10_0 = constant 10 : index
%c1 = constant 1 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() {hal.bindings = [#hal.ex.result_buffer<"s0b0_xw_external", 0 : index>]} : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
// -----// IR Dump After Canonicalizer //----- //
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 10)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0)[s0] -> (-d0 + 10, s0)>
#map5 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map7 = affine_map<(d0, d1, d2) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @matmul_test_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ {
hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%2 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%4 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min #map1(%arg0)[%workgroup_size_y]
%6 = affine.min #map1(%arg1)[%workgroup_size_x]
%7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32>
%8 = affine.min #map1(%arg0)[%workgroup_size_y]
%9 = affine.min #map1(%arg1)[%workgroup_size_x]
%10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32>
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%12 = linalg.index 0 : index
%13 = affine.apply #map3(%12, %arg0)
%14 = linalg.index 1 : index
%15 = affine.apply #map3(%14, %arg1)
%16 = cmpi eq, %13, %15 : index
%17 = select %16, %cst_0, %cst : f32
linalg.yield %17 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
}
}
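  // NOTE: dispatch_0 materializes the shared test input: a 10x10 tensor that is
  // 0.0 on the diagonal and 1.0 elsewhere (select on linalg.index equality).
  // Both later dispatches consume this same tensor as both matmul operands.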
hal.executable private @matmul_test_dispatch_2 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ {
hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min #map1(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min #map1(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min #map1(%arg0)[%workgroup_size_y]
%12 = affine.min #map1(%arg1)[%workgroup_size_x]
%13 = affine.min #map4(%arg0)[%workgroup_size_y]
%14 = affine.min #map4(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min #map4(%arg0)[%workgroup_size_y]
%17 = affine.min #map4(%arg1)[%workgroup_size_x]
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply #map3(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply #map3(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst_0, %cst : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
hal.executable private @matmul_test_dispatch_3 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @system_elf_x86_64, target = #executable_target_system_elf_x86_64_ {
hal.executable.entry_point public @matmul_test_dispatch_3 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_3() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min #map1(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min #map1(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min #map1(%arg0)[%workgroup_size_y]
%12 = affine.min #map1(%arg1)[%workgroup_size_x]
%13 = affine.min #map4(%arg0)[%workgroup_size_y]
%14 = affine.min #map4(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min #map4(%arg0)[%workgroup_size_y]
%17 = affine.min #map4(%arg1)[%workgroup_size_x]
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply #map3(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply #map3(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst_0, %cst : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.generic {indexing_maps = [#map5, #map6, #map7], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%21 = mulf %arg2, %arg3 : f32
%22 = addf %21, %arg4 : f32
linalg.yield %22 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @matmul_test() attributes {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}} {
%c10 = constant 10 : index
%0:2 = flow.ex.stream.fragment() : () -> (tensor<?x?xf32>{%c10, %c10}, tensor<?x?xf32>{%c10, %c10}) =
() -> (tensor<?x?xf32>, tensor<?x?xf32>) {
%c1 = constant 1 : index
%c10_0 = constant 10 : index
%1 = flow.dispatch @matmul_test_dispatch_0::@matmul_test_dispatch_0[%c10_0, %c10_0, %c1]() {hal.bindings = [#hal.ex.result_buffer<"s0b0_xw_external", 0 : index>]} : () -> tensor<10x10xf32>
%2 = flow.dispatch @matmul_test_dispatch_2::@matmul_test_dispatch_2[%c10_0, %c10_0, %c1](%1, %1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%3 = flow.tensor.reshape %2 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
%4 = flow.dispatch @matmul_test_dispatch_3::@matmul_test_dispatch_3[%c10_0, %c10_0, %c1](%1, %1) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%5 = flow.tensor.reshape %4 : tensor<10x10xf32> -> tensor<?x?xf32>{%c10_0, %c10_0}
flow.return %3, %5 : tensor<?x?xf32>, tensor<?x?xf32>
}
check.expect_eq(%0#0, %0#1) : tensor<?x?xf32>
return
}
}
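// NOTE: the dumps below are scoped to individual hal.executable.variant ops
// rather than the whole module; the HAL translation pipeline appears to run
// separately on dispatch_0, dispatch_2, and dispatch_3.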
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- //
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y]
%6 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x]
%7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32>
%8 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y]
%9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x]
%10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%12 = linalg.index 0 : index
%13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0)
%14 = linalg.index 1 : index
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1)
%16 = cmpi eq, %13, %15 : index
%17 = select %16, %cst_0, %cst : f32
linalg.yield %17 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- //
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x]
%13 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg0)[%workgroup_size_y]
%14 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg0)[%workgroup_size_y]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg1)[%workgroup_size_x]
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst_0, %cst : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- //
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_3 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @matmul_test_dispatch_3() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 10)>(%arg1)[%workgroup_size_x]
%13 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg0)[%workgroup_size_y]
%14 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg1)[%workgroup_size_x]
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg0)[%workgroup_size_y]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 10, s0)>(%arg1)[%workgroup_size_x]
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst_0, %cst : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%21 = mulf %arg2, %arg3 : f32
%22 = addf %21, %arg4 : f32
linalg.yield %22 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After LLVMCPULowerExecutableTarget Failed //----- //
"hal.executable.variant"() ( {
"hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> ()
"builtin.module"() ( {
"builtin.func"() ( {
%0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32
%1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32
%2 = "std.constant"() {value = 10 : index} : () -> index
%3 = "std.constant"() {value = 0 : index} : () -> index
%4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32>
%7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index
%8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index
%9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%13, %2, %14) ( {
^bb0(%arg0: index): // no predecessors
%15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%15, %2, %16) ( {
^bb0(%arg1: index): // no predecessors
%17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32>
%19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32>
%21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%29 = "linalg.generic"(%25, %28) ( {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%31 = "linalg.index"() {dim = 0 : i64} : () -> index
%32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%33 = "linalg.index"() {dim = 1 : i64} : () -> index
%34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1
%36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32
"linalg.yield"(%36) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%30 = "linalg.generic"(%18, %20, %29) ( {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32
%32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32
"linalg.yield"(%32) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
"flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"std.return"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> ()
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io", sym_visibility = "private"} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> ()
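// NOTE: LLVMCPULowerExecutableTarget fails on dispatch_3, where the matmul is
// expressed as a plain linalg.generic. Both generics here carry only the
// first-level lowering.config {tileSizes = [[64, 64]]}, with no
// nativeVectorSize or inner tile levels; compare the named linalg.matmul in
// dispatch_2, which receives the full config in the SetNumWorkgroups dump
// further below. Presumably the CPUVectorization pipeline cannot lower the
// generic form with only the outer tiling configured.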
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass Failed //----- //
"hal.executable.variant"() ( {
"hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> ()
"builtin.module"() ( {
"builtin.func"() ( {
%0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32
%1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32
%2 = "std.constant"() {value = 10 : index} : () -> index
%3 = "std.constant"() {value = 0 : index} : () -> index
%4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32>
%7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index
%8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index
%9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%13, %2, %14) ( {
^bb0(%arg0: index): // no predecessors
%15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%15, %2, %16) ( {
^bb0(%arg1: index): // no predecessors
%17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32>
%19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32>
%21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%29 = "linalg.generic"(%25, %28) ( {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%31 = "linalg.index"() {dim = 0 : i64} : () -> index
%32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%33 = "linalg.index"() {dim = 1 : i64} : () -> index
%34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1
%36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32
"linalg.yield"(%36) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%30 = "linalg.generic"(%18, %20, %29) ( {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32
%32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32
"linalg.yield"(%32) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
"flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"std.return"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> ()
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io", sym_visibility = "private"} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> ()
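// NOTE: the failed module above is identical to the preceding
// LLVMCPULowerExecutableTarget dump; TranslateTargetExecutableVariantsPass
// appears to re-report the failure of its nested pipeline on dispatch_3.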
// -----// IR Dump After SetNumWorkgroups //----- //
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module {
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%c64 = constant 64 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c64]
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c64]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c64]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c64]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32>
%8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%12 = linalg.index 0 : index
%13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0)
%14 = linalg.index 1 : index
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1)
%16 = cmpi eq, %13, %15 : index
%17 = select %16, %cst_0, %cst : f32
linalg.yield %17 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
}
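// NOTE: SetNumWorkgroups pins the workgroup tiling: with workloadPerWorkgroup =
// [64, 64] and the 10x10 workload, the entry-point region returns
// ceildiv(10, 64) = 1 workgroup along x and y, and the dynamic
// hal.interface.workgroup.size queries are replaced by the constant %c64.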
// -----// IR Dump After SetNumWorkgroups //----- //
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module {
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%c64 = constant 64 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c64]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c64]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c64]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c64]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%13 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%17 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst_0, %cst : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
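// NOTE: the named linalg.matmul in dispatch_2 is assigned the full
// lowering.config {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64],
// [32, 32, 32], [4, 4, 4]]}, unlike the generic-form matmul in dispatch_3,
// which only received tileSizes = [[64, 64]] before its lowering failed.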
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass Failed //----- //
"hal.executable"() ( {
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io"} : () -> ()
"hal.executable.variant"() ( {
"hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> ()
"builtin.module"() ( {
"builtin.func"() ( {
%0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32
%1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32
%2 = "std.constant"() {value = 10 : index} : () -> index
%3 = "std.constant"() {value = 0 : index} : () -> index
%4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32>
%7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index
%8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index
%9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%13, %2, %14) ( {
^bb0(%arg0: index): // no predecessors
%15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%15, %2, %16) ( {
^bb0(%arg1: index): // no predecessors
%17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32>
%19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32>
%21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%29 = "linalg.generic"(%25, %28) ( {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%31 = "linalg.index"() {dim = 0 : i64} : () -> index
%32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%33 = "linalg.index"() {dim = 1 : i64} : () -> index
%34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1
%36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32
"linalg.yield"(%36) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%30 = "linalg.generic"(%18, %20, %29) ( {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32
%32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32
"linalg.yield"(%32) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
"flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"std.return"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> ()
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io", sym_visibility = "private"} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> ()
"hal.executable_end"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", sym_visibility = "private"} : () -> ()
// -----// IR Dump After Canonicalizer //----- //
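// Note: the dumps above print each op in MLIR's generic form ("scf.for"(...) ( { ... }));
// from here on the same executables are shown pretty-printed. Worked example for this
// 10x10 workload: the entry point below returns ceildiv(10, 64) = 1 workgroup along x
// and y, so each scf.for in the body runs exactly one iteration (lower bound =
// workgroup_id * 64 = 0, step = workgroup_count * 64 = 64), and
// affine.min(64, -d0 + 10) clamps both tile sizes to 10.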
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module {
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32>
%8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%12 = linalg.index 0 : index
%13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0)
%14 = linalg.index 1 : index
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1)
%16 = cmpi eq, %13, %15 : index
%17 = select %16, %cst, %cst_0 : f32
linalg.yield %17 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After Canonicalizer //----- //
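// This dispatch fuses the identity-matrix initialization (the linalg.generic below)
// with the linalg.matmul that consumes it. The matmul carries the full three-level
// lowering.config: 64x64 workgroup tiles, a second 32x32x32 tiling level, and 4x4x4
// tiles matching nativeVectorSize = [4, 4, 4].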
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module {
func @matmul_test_dispatch_2() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%13 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%17 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst, %cst_0 : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After Canonicalizer //----- //
module {
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = linalg.init_tensor [%5, %6] : tensor<?x?xf32>
%8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = linalg.init_tensor [%8, %9] : tensor<?x?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%12 = linalg.index 0 : index
%13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0)
%14 = linalg.index 1 : index
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1)
%16 = cmpi eq, %13, %15 : index
%17 = select %16, %cst_0, %cst : f32
linalg.yield %17 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %11, %0, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After Canonicalizer //----- //
module {
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [10, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%11 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%13 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
%16 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%17 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%18 = linalg.init_tensor [%16, %17] : tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%21 = linalg.index 0 : index
%22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%21, %arg0)
%23 = linalg.index 1 : index
%24 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%23, %arg1)
%25 = cmpi eq, %22, %24 : index
%26 = select %25, %cst_0, %cst : f32
linalg.yield %26 : f32
} -> tensor<?x?xf32>
%20 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %20, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:10x10xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After LinalgBufferize //----- //
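// Bufferization introduces a memref view next to each !flow.dispatch.tensor subspan
// and emits memref twins of the tensor ops (memref.subview for the tiled loads and
// stores, memref.alloca for linalg.init_tensor, and second linalg.generic/linalg.matmul
// ops on memrefs). The original tensor ops are left in place for now; they are dead
// and are cleaned up by the following passes.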
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c10 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c10 step %5 {
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%8 = linalg.init_tensor [%6, %7] : tensor<?x?xf32>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%11 = linalg.init_tensor [%9, %10] : tensor<?x?xf32>
%12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%13 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%14 = memref.subview %0[%arg0, %arg1] [%12, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%15 = memref.alloca(%6, %7) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : memref<?x?xf32>) outs(%14 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%17 = linalg.index 0 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg0)
%19 = linalg.index 1 : index
%20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%19, %arg1)
%21 = cmpi eq, %18, %20 : index
%22 = select %21, %cst_0, %cst : f32
linalg.yield %22 : f32
}
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%17 = linalg.index 0 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg0)
%19 = linalg.index 1 : index
%20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%19, %arg1)
%21 = cmpi eq, %18, %20 : index
%22 = select %21, %cst_0, %cst : f32
linalg.yield %22 : f32
} -> tensor<?x?xf32>
}
}
return
}
// -----// IR Dump After LinalgBufferize //----- //
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %6 to %c10 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %8 to %c10 step %9 {
%10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%11 = memref.subview %0[%arg0, 0] [%10, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%12 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0], sizes = [%10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<?x10xf32>
%13 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%14 = memref.subview %2[0, %arg1] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%15 = flow.dispatch.tensor.load %3, offsets = [0, %arg1], sizes = [10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:10x10xf32> -> tensor<10x?xf32>
%16 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%17 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%18 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%19 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%20 = linalg.init_tensor [%18, %19] : tensor<?x?xf32>
%21 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%22 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%23 = linalg.init_tensor [%21, %22] : tensor<?x?xf32>
%24 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%25 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%26 = memref.subview %4[%arg0, %arg1] [%24, %25] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%27 = memref.alloca(%18, %19) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%27 : memref<?x?xf32>) outs(%26 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%30 = linalg.index 0 : index
%31 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%30, %arg0)
%32 = linalg.index 1 : index
%33 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%32, %arg1)
%34 = cmpi eq, %31, %33 : index
%35 = select %34, %cst_0, %cst : f32
linalg.yield %35 : f32
}
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%20 : tensor<?x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%30 = linalg.index 0 : index
%31 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%30, %arg0)
%32 = linalg.index 1 : index
%33 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%32, %arg1)
%34 = cmpi eq, %31, %33 : index
%35 = select %34, %cst_0, %cst : f32
linalg.yield %35 : f32
} -> tensor<?x?xf32>
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%11, %14 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%26 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
%29 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%12, %15 : tensor<?x10xf32>, tensor<10x?xf32>) outs(%28 : tensor<?x?xf32>) -> tensor<?x?xf32>
}
}
return
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
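// After shape-dim resolution the dead tensor-side ops (linalg.init_tensor,
// flow.dispatch.tensor.load, and the tensor variants of the linalg ops) no longer
// appear; only their memref counterparts remain. The unused !flow.dispatch.tensor
// subspans still linger and are removed later.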
module {
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c10 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c10 step %5 {
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %0[%arg0, %arg1] [%8, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = memref.alloca(%6, %7) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : memref<?x?xf32>) outs(%10 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%12 = linalg.index 0 : index
%13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0)
%14 = linalg.index 1 : index
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1)
%16 = cmpi eq, %13, %15 : index
%17 = select %16, %cst_0, %cst : f32
linalg.yield %17 : f32
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %6 to %c10 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %8 to %c10 step %9 {
%10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%11 = memref.subview %0[%arg0, 0] [%10, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%13 = memref.subview %2[0, %arg1] [10, %12] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%15 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%16 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%17 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%18 = memref.subview %4[%arg0, %arg1] [%16, %17] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = memref.alloca(%14, %15) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : memref<?x?xf32>) outs(%18 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%20 = linalg.index 0 : index
%21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg0)
%22 = linalg.index 1 : index
%23 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%22, %arg1)
%24 = cmpi eq, %21, %23 : index
%25 = select %24, %cst_0, %cst : f32
linalg.yield %25 : f32
}
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%11, %13 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%18 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After Canonicalizer //----- //
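// Canonicalization here is mostly cosmetic: the constants are re-sorted and renamed
// (%cst is now 0.0 and %cst_0 is 1.0). The dump is scoped to the function, so the
// hal.interface declarations are not reprinted.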
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c10 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c10 step %5 {
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%8 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %0[%arg0, %arg1] [%8, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = memref.alloca(%6, %7) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : memref<?x?xf32>) outs(%10 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%12 = linalg.index 0 : index
%13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg0)
%14 = linalg.index 1 : index
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %arg1)
%16 = cmpi eq, %13, %15 : index
%17 = select %16, %cst, %cst_0 : f32
linalg.yield %17 : f32
}
}
}
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test_dispatch_2() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %6 to %c10 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %8 to %c10 step %9 {
%10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%11 = memref.subview %0[%arg0, 0] [%10, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%13 = memref.subview %2[0, %arg1] [10, %12] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%15 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%16 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%17 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%18 = memref.subview %4[%arg0, %arg1] [%16, %17] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = memref.alloca(%14, %15) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%19 : memref<?x?xf32>) outs(%18 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%20 = linalg.index 0 : index
%21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg0)
%22 = linalg.index 1 : index
%23 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%22, %arg1)
%24 = cmpi eq, %21, %23 : index
%25 = select %24, %cst, %cst_0 : f32
linalg.yield %25 : f32
}
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%11, %13 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%18 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
return
}
// -----// IR Dump After CSE //----- //
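// CSE merges the syntactically identical affine.min results, so the memref.subview
// and the memref.alloca below now share one pair of size values. Note that
// affine.min (64, -d0 + 10) and affine.min (-d0 + 10, 64) are different maps, so CSE
// cannot merge those two spellings; both survive into the later dumps.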
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c10 step %3 {
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %4 to %c10 step %5 {
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%8 = memref.subview %0[%arg0, %arg1] [%6, %7] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = memref.alloca(%6, %7) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : memref<?x?xf32>) outs(%8 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%10 = linalg.index 0 : index
%11 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%10, %arg0)
%12 = linalg.index 1 : index
%13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %arg1)
%14 = cmpi eq, %11, %13 : index
%15 = select %14, %cst, %cst_0 : f32
linalg.yield %15 : f32
}
}
}
return
}
// -----// IR Dump After CSE //----- //
func @matmul_test_dispatch_2() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:10x10xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %6 to %c10 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %8 to %c10 step %9 {
%10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%11 = memref.subview %0[%arg0, 0] [%10, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%12 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%13 = memref.subview %2[0, %arg1] [10, %12] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%15 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%16 = memref.subview %4[%arg0, %arg1] [%10, %12] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%17 = memref.alloca(%14, %15) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%17 : memref<?x?xf32>) outs(%16 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%18 = linalg.index 0 : index
%19 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %arg0)
%20 = linalg.index 1 : index
%21 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %arg1)
%22 = cmpi eq, %19, %21 : index
%23 = select %22, %cst, %cst_0 : f32
linalg.yield %23 : f32
}
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%11, %13 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%16 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
return
}
// -----// IR Dump After CleanupBufferAllocView //----- //
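// CleanupBufferAllocView drops the now-unused !flow.dispatch.tensor subspans, leaving
// only the memref binding for each interface buffer.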
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst, %cst_0 : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After CSE //----- //
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst, %cst_0 : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After CleanupBufferAllocView //----- //
func @matmul_test_dispatch_2() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst, %cst_0 : f32
linalg.yield %20 : f32
}
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
return
}
// -----// IR Dump After CSE //----- //
func @matmul_test_dispatch_2() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst, %cst_0 : f32
linalg.yield %20 : f32
}
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst_0, %cst : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst_0, %cst : f32
linalg.yield %20 : f32
}
linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%8, %10 : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
return
}
// -----// IR Dump After LLVMCPUVectorization //----- //
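// For dispatch_0 there is no matmul to tile, so LLVMCPUVectorization appears to leave
// the body unchanged; compare the identical CSE dump that follows.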
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst_0, %cst : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After CSE //----- //
func @matmul_test_dispatch_0() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst_0, %cst : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After LLVMCPUVectorization //----- //
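// Here LLVMCPUVectorization tiles the 64x64 workgroup matmul with nested scf.for
// loops: first to 32x32 tiles over M and N (K is 10, already below the 32 tile size),
// then to 4x4x4 subviews matching nativeVectorSize. The innermost linalg.matmul is
// re-tagged __internal_linalg_transform__ = "vectorize", presumably the marker that
// the subsequent vector-lowering patterns key on.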
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c4 = constant 4 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst_0, %cst : f32
linalg.yield %20 : f32
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%20 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%21 = memref.subview %13[%arg2, %arg3] [%19, %20] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %15 step %c4 {
scf.for %arg5 = %c0 to %17 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4)
%23 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%24 = memref.subview %16[%arg4, %arg6] [%22, %23] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%25 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%26 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5)
%27 = memref.subview %18[%arg6, %arg5] [%25, %26] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%28 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4)
%29 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5)
%30 = memref.subview %21[%arg4, %arg5] [%28, %29] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%24, %27 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%30 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
}
}
}
}
}
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst, %cst_0 : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After CSE //----- //
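// CSE folds the duplicated bound computations from the previous dump: the
// 32-tile output subview now reuses %15/%17 directly, and the 4-tile
// subviews share %20/%21/%23 instead of recomputing the same affine.min
// values.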
func @matmul_test_dispatch_2() {
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c4 = constant 4 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst_0, %cst : f32
linalg.yield %20 : f32
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %15 step %c4 {
scf.for %arg5 = %c0 to %17 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4)
%21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5)
%24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
}
}
}
}
}
return
}
// -----// IR Dump After ForOpCanonicalization //----- //
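// ForOpCanonicalization finds nothing to rewrite in the fill loops, so this
// dump matches the preceding Canonicalizer output for dispatch_0.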
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst, %cst_0 : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst, %cst_0 : f32
linalg.yield %20 : f32
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %15 step %c4 {
scf.for %arg5 = %c0 to %17 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4)
%21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5)
%24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
}
}
}
}
}
return
}
// -----// IR Dump After LLVMCPUPlanConvLoopOrder //----- //
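// This dispatch contains no convolution, so LLVMCPUPlanConvLoopOrder leaves
// the IR untouched.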
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst, %cst_0 : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After ForOpCanonicalization //----- //
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst, %cst_0 : f32
linalg.yield %20 : f32
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %15 step %c4 {
scf.for %arg5 = %c0 to %17 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4)
%21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5)
%24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
}
}
}
}
}
return
}
// -----// IR Dump After LLVMCPUPlanConvLoopOrder //----- //
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst, %cst_0 : f32
linalg.yield %20 : f32
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %15 step %c4 {
scf.for %arg5 = %c0 to %17 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4)
%21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5)
%24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
}
}
}
}
}
return
}
// -----// IR Dump After LLVMCPULowerExecutableTarget //----- //
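// From here the dump shows the enclosing hal.executable.variant for the
// x86_64 target. The entry point's region maps the workload to workgroup
// counts (ceildiv 64 in x and y, 1 in z, matching workloadPerWorkgroup =
// [64, 64]), and the @io interface declares the single Write|Discard output
// binding that the fill writes through.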
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module {
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst, %cst_0 : f32
linalg.yield %14 : f32
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After LLVMCPULowerExecutableTarget //----- //
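// Same executable wrapping for dispatch_2; its @io interface carries two
// read-only input bindings (s0b0, s0b1) for the matmul operands plus the
// Write|Discard result buffer (s0b2).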
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module {
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst, %cst_0 : f32
linalg.yield %20 : f32
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %15 step %c4 {
scf.for %arg5 = %c0 to %17 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4)
%21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5)
%24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
}
}
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After LinalgExtToLoops //----- //
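// No linalg_ext ops are present, so this dump is unchanged from the previous
// one for dispatch_0.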
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%8 = memref.alloca(%5, %6) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : memref<?x?xf32>) outs(%7 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[64, 64]]}} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%9 = linalg.index 0 : index
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %arg0)
%11 = linalg.index 1 : index
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %arg1)
%13 = cmpi eq, %10, %12 : index
%14 = select %13, %cst, %cst_0 : f32
linalg.yield %14 : f32
}
}
}
return
}
// -----// IR Dump After LinalgExtToLoops //----- //
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%14 = memref.alloca(%11, %12) : memref<?x?xf32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%14 : memref<?x?xf32>) outs(%13 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%15 = linalg.index 0 : index
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%15, %arg0)
%17 = linalg.index 1 : index
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%17, %arg1)
%19 = cmpi eq, %16, %18 : index
%20 = select %19, %cst, %cst_0 : f32
linalg.yield %20 : f32
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%15 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%16 = memref.subview %8[%arg2, 0] [%15, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%17 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%18 = memref.subview %10[0, %arg3] [10, %17] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%19 = memref.subview %13[%arg2, %arg3] [%15, %17] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %15 step %c4 {
scf.for %arg5 = %c0 to %17 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%20 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%15, %arg4)
%21 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%22 = memref.subview %16[%arg4, %arg6] [%20, %21] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%23 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%17, %arg5)
%24 = memref.subview %18[%arg6, %arg5] [%21, %23] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%25 = memref.subview %19[%arg4, %arg5] [%20, %23] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}} ins(%22, %24 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>) outs(%25 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>)
}
}
}
}
}
}
}
return
}
// -----// IR Dump After LinalgLowerToLoops //----- //
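// LinalgLowerToLoops converts the fill generic into a scalar i/j loop pair
// ending in memref.store. The alloca that only fed the generic's unused
// input no longer appears, and a %c1 step constant is introduced for the
// unit-step loops.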
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg2 = %c0 to %5 step %c1 {
scf.for %arg3 = %c0 to %6 step %c1 {
%8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0)
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1)
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
memref.store %11, %7[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
}
}
return
}
// -----// IR Dump After LinalgLowerToLoops //----- //
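// Both remaining linalg ops in dispatch_2 become explicit loops: the
// index-dependent fill as an i/j nest storing into the output subview, and
// the 4x4x4 linalg.matmul as a scalar i/j/k nest of memref.load / mulf /
// addf / memref.store on the tiled subviews.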
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg2 = %c0 to %11 step %c1 {
scf.for %arg3 = %c0 to %12 step %c1 {
%14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0)
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1)
%16 = cmpi eq, %14, %15 : index
%17 = select %16, %cst, %cst_0 : f32
memref.store %17, %13[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%14 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%15 = memref.subview %8[%arg2, 0] [%14, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%16 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%17 = memref.subview %10[0, %arg3] [10, %16] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%18 = memref.subview %13[%arg2, %arg3] [%14, %16] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %14 step %c4 {
scf.for %arg5 = %c0 to %16 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%19 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%14, %arg4)
%20 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%21 = memref.subview %15[%arg4, %arg6] [%19, %20] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%16, %arg5)
%23 = memref.subview %17[%arg6, %arg5] [%20, %22] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%24 = memref.subview %18[%arg4, %arg5] [%19, %22] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg7 = %c0 to %19 step %c1 {
scf.for %arg8 = %c0 to %22 step %c1 {
scf.for %arg9 = %c0 to %20 step %c1 {
%25 = memref.load %21[%arg7, %arg9] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = memref.load %23[%arg9, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%27 = memref.load %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%28 = mulf %25, %26 : f32
%29 = addf %27, %28 : f32
memref.store %29, %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
}
}
}
}
}
}
}
}
return
}
// -----// IR Dump After mlir::iree_compiler::Shape::{anonymous}::FoldDimOverShapeCarryingOpPass //----- //
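// There are no memref.dim ops over shape-carrying ops to fold here, so this
// pass leaves dispatch_0 unchanged.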
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg2 = %c0 to %5 step %c1 {
scf.for %arg3 = %c0 to %6 step %c1 {
%8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0)
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1)
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst, %cst_0 : f32
memref.store %11, %7[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
}
}
return
}
// -----// IR Dump After mlir::iree_compiler::Shape::{anonymous}::FoldDimOverShapeCarryingOpPass //----- //
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg2 = %c0 to %11 step %c1 {
scf.for %arg3 = %c0 to %12 step %c1 {
%14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0)
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1)
%16 = cmpi eq, %14, %15 : index
%17 = select %16, %cst, %cst_0 : f32
memref.store %17, %13[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%14 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%15 = memref.subview %8[%arg2, 0] [%14, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%16 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%17 = memref.subview %10[0, %arg3] [10, %16] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%18 = memref.subview %13[%arg2, %arg3] [%14, %16] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %14 step %c4 {
scf.for %arg5 = %c0 to %16 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%19 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%14, %arg4)
%20 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%21 = memref.subview %15[%arg4, %arg6] [%19, %20] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%16, %arg5)
%23 = memref.subview %17[%arg6, %arg5] [%20, %22] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%24 = memref.subview %18[%arg4, %arg5] [%19, %22] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg7 = %c0 to %19 step %c1 {
scf.for %arg8 = %c0 to %22 step %c1 {
scf.for %arg9 = %c0 to %20 step %c1 {
%25 = memref.load %21[%arg7, %arg9] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = memref.load %23[%arg9, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%27 = memref.load %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%28 = mulf %25, %26 : f32
%29 = addf %27, %28 : f32
memref.store %29, %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
}
}
}
}
}
}
}
}
return
}
// -----// IR Dump After Canonicalizer //----- //
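// Canonicalization only re-sorts the hoisted constants (swapping the roles
// of %cst and %cst_0 accordingly); the select still stores 0.0 on the
// diagonal and 1.0 off it.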
func @matmul_test_dispatch_0() {
%c1 = constant 1 : index
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg2 = %c0 to %5 step %c1 {
scf.for %arg3 = %c0 to %6 step %c1 {
%8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0)
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1)
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
memref.store %11, %7[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
}
}
return
}
// -----// IR Dump After Canonicalizer //----- //
func @matmul_test_dispatch_2() {
%c1 = constant 1 : index
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c4 = constant 4 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg2 = %c0 to %11 step %c1 {
scf.for %arg3 = %c0 to %12 step %c1 {
%14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0)
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1)
%16 = cmpi eq, %14, %15 : index
%17 = select %16, %cst_0, %cst : f32
memref.store %17, %13[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%14 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%15 = memref.subview %8[%arg2, 0] [%14, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%16 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%17 = memref.subview %10[0, %arg3] [10, %16] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%18 = memref.subview %13[%arg2, %arg3] [%14, %16] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %14 step %c4 {
scf.for %arg5 = %c0 to %16 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%19 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%14, %arg4)
%20 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%21 = memref.subview %15[%arg4, %arg6] [%19, %20] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%16, %arg5)
%23 = memref.subview %17[%arg6, %arg5] [%20, %22] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%24 = memref.subview %18[%arg4, %arg5] [%19, %22] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg7 = %c0 to %19 step %c1 {
scf.for %arg8 = %c0 to %22 step %c1 {
scf.for %arg9 = %c0 to %20 step %c1 {
%25 = memref.load %21[%arg7, %arg9] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = memref.load %23[%arg9, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%27 = memref.load %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%28 = mulf %25, %26 : f32
%29 = addf %27, %28 : f32
memref.store %29, %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
}
}
}
}
}
}
}
}
return
}
// -----// IR Dump After CSE //----- //
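// NOTE: CSE removes duplicate subexpressions; canonicalization already left
// none in dispatch_0, so this dump is identical to the previous one.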
func @matmul_test_dispatch_0() {
%c1 = constant 1 : index
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %1 to %c10 step %2 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %3 to %c10 step %4 {
%5 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%6 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%7 = memref.subview %0[%arg0, %arg1] [%5, %6] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg2 = %c0 to %5 step %c1 {
scf.for %arg3 = %c0 to %6 step %c1 {
%8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0)
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1)
%10 = cmpi eq, %8, %9 : index
%11 = select %10, %cst_0, %cst : f32
memref.store %11, %7[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
}
}
return
}
// -----// IR Dump After CSE //----- //
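// NOTE: dispatch_2 is likewise unchanged. The near-duplicate tile bounds
// (%7 vs. %11, %9 vs. %12) survive CSE because their affine.min maps list
// the same terms in a different order ((64, -d0 + 10) vs. (-d0 + 10, 64)),
// and CSE only merges operations whose attributes match exactly.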
func @matmul_test_dispatch_2() {
%c1 = constant 1 : index
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c4 = constant 4 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c10 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c10 step %6 {
%7 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg0)
%8 = memref.subview %0[%arg0, 0] [%7, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%arg1)
%10 = memref.subview %1[0, %arg1] [10, %9] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%11 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg0)
%12 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%arg1)
%13 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg2 = %c0 to %11 step %c1 {
scf.for %arg3 = %c0 to %12 step %c1 {
%14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg2, %arg0)
%15 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg1)
%16 = cmpi eq, %14, %15 : index
%17 = select %16, %cst_0, %cst : f32
memref.store %17, %13[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
scf.for %arg2 = %c0 to %7 step %c32 {
scf.for %arg3 = %c0 to %9 step %c32 {
%14 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%7, %arg2)
%15 = memref.subview %8[%arg2, 0] [%14, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%16 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%9, %arg3)
%17 = memref.subview %10[0, %arg3] [10, %16] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%18 = memref.subview %13[%arg2, %arg3] [%14, %16] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg4 = %c0 to %14 step %c4 {
scf.for %arg5 = %c0 to %16 step %c4 {
scf.for %arg6 = %c0 to %c10 step %c4 {
%19 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%14, %arg4)
%20 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%arg6)
%21 = memref.subview %15[%arg4, %arg6] [%19, %20] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%16, %arg5)
%23 = memref.subview %17[%arg6, %arg5] [%20, %22] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%24 = memref.subview %18[%arg4, %arg5] [%19, %22] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
scf.for %arg7 = %c0 to %19 step %c1 {
scf.for %arg8 = %c0 to %22 step %c1 {
scf.for %arg9 = %c0 to %20 step %c1 {
%25 = memref.load %21[%arg7, %arg9] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = memref.load %23[%arg9, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%27 = memref.load %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%28 = mulf %25, %26 : f32
%29 = addf %27, %28 : f32
memref.store %29, %24[%arg7, %arg8] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
}
}
}
}
}
}
}
}
}
}
return
}
// -----// IR Dump After SCFToStandard //----- //
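// NOTE: SCFToStandard lowers every scf.for into explicit CFG form: a header
// block carrying the induction variable as a block argument, a cmpi slt
// bound check feeding cond_br, and a latch block that adds the step and
// branches back to the header. Loop bodies are carried over unchanged.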
func @matmul_test_dispatch_0() {
%c1 = constant 1 : index
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%1 : index)
^bb1(%3: index): // 2 preds: ^bb0, ^bb11
%4 = cmpi slt, %3, %c10 : index
cond_br %4, ^bb2, ^bb12
^bb2: // pred: ^bb1
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%5 : index)
^bb3(%7: index): // 2 preds: ^bb2, ^bb10
%8 = cmpi slt, %7, %c10 : index
cond_br %8, ^bb4, ^bb11
^bb4: // pred: ^bb3
%9 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%3)
%10 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%7)
%11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%12: index): // 2 preds: ^bb4, ^bb9
%13 = cmpi slt, %12, %9 : index
cond_br %13, ^bb6, ^bb10
^bb6: // pred: ^bb5
br ^bb7(%c0 : index)
^bb7(%14: index): // 2 preds: ^bb6, ^bb8
%15 = cmpi slt, %14, %10 : index
cond_br %15, ^bb8, ^bb9
^bb8: // pred: ^bb7
%16 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %3)
%17 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%14, %7)
%18 = cmpi eq, %16, %17 : index
%19 = select %18, %cst_0, %cst : f32
memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%20 = addi %14, %c1 : index
br ^bb7(%20 : index)
^bb9: // pred: ^bb7
%21 = addi %12, %c1 : index
br ^bb5(%21 : index)
^bb10: // pred: ^bb5
%22 = addi %7, %6 : index
br ^bb3(%22 : index)
^bb11: // pred: ^bb3
%23 = addi %3, %2 : index
br ^bb1(%23 : index)
^bb12: // pred: ^bb1
return
}
// -----// IR Dump After SCFToStandard //----- //
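// NOTE: the same lowering applied to dispatch_2; its twelve nested scf.for
// loops expand into the ^bb1..^bb36 block ladder below.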
func @matmul_test_dispatch_2() {
%c1 = constant 1 : index
%cst = constant 1.000000e+00 : f32
%cst_0 = constant 0.000000e+00 : f32
%c10 = constant 10 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c4 = constant 4 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%3 : index)
^bb1(%5: index): // 2 preds: ^bb0, ^bb35
%6 = cmpi slt, %5, %c10 : index
cond_br %6, ^bb2, ^bb36
^bb2: // pred: ^bb1
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%7 : index)
^bb3(%9: index): // 2 preds: ^bb2, ^bb34
%10 = cmpi slt, %9, %c10 : index
cond_br %10, ^bb4, ^bb35
^bb4: // pred: ^bb3
%11 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%5)
%12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%13 = affine.min affine_map<(d0) -> (64, -d0 + 10)>(%9)
%14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%15 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%5)
%16 = affine.min affine_map<(d0) -> (-d0 + 10, 64)>(%9)
%17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%18: index): // 2 preds: ^bb4, ^bb9
%19 = cmpi slt, %18, %15 : index
cond_br %19, ^bb6, ^bb10
^bb6: // pred: ^bb5
br ^bb7(%c0 : index)
^bb7(%20: index): // 2 preds: ^bb6, ^bb8
%21 = cmpi slt, %20, %16 : index
cond_br %21, ^bb8, ^bb9
^bb8: // pred: ^bb7
%22 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%18, %5)
%23 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%20, %9)
%24 = cmpi eq, %22, %23 : index
%25 = select %24, %cst_0, %cst : f32
memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = addi %20, %c1 : index
br ^bb7(%26 : index)
^bb9: // pred: ^bb7
%27 = addi %18, %c1 : index
br ^bb5(%27 : index)
^bb10: // pred: ^bb5
br ^bb11(%c0 : index)
^bb11(%28: index): // 2 preds: ^bb10, ^bb33
%29 = cmpi slt, %28, %11 : index
cond_br %29, ^bb12, ^bb34
^bb12: // pred: ^bb11
br ^bb13(%c0 : index)
^bb13(%30: index): // 2 preds: ^bb12, ^bb32
%31 = cmpi slt, %30, %13 : index
cond_br %31, ^bb14, ^bb33
^bb14: // pred: ^bb13
%32 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%11, %28)
%33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%34 = affine.min affine_map<(d0, d1) -> (32, d0 - d1)>(%13, %30)
%35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb15(%c0 : index)
^bb15(%37: index): // 2 preds: ^bb14, ^bb31
%38 = cmpi slt, %37, %32 : index
cond_br %38, ^bb16, ^bb32
^bb16: // pred: ^bb15
br ^bb17(%c0 : index)
^bb17(%39: index): // 2 preds: ^bb16, ^bb30
%40 = cmpi slt, %39, %34 : index
cond_br %40, ^bb18, ^bb31
^bb18: // pred: ^bb17
br ^bb19(%c0 : index)
^bb19(%41: index): // 2 preds: ^bb18, ^bb29
%42 = cmpi slt, %41, %c10 : index
cond_br %42, ^bb20, ^bb30
^bb20: // pred: ^bb19
%43 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%32, %37)
%44 = affine.min affine_map<(d0) -> (4, -d0 + 10)>(%41)
%45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%46 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%34, %39)
%47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb21(%c0 : index)
^bb21(%49: index): // 2 preds: ^bb20, ^bb28
%50 = cmpi slt, %49, %43 : index
cond_br %50, ^bb22, ^bb29
^bb22: // pred: ^bb21
br ^bb23(%c0 : index)
^bb23(%51: index): // 2 preds: ^bb22, ^bb27
%52 = cmpi slt, %51, %46 : index
cond_br %52, ^bb24, ^bb28
^bb24: // pred: ^bb23
br ^bb25(%c0 : index)
^bb25(%53: index): // 2 preds: ^bb24, ^bb26
%54 = cmpi slt, %53, %44 : index
cond_br %54, ^bb26, ^bb27
^bb26: // pred: ^bb25
%55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%58 = mulf %55, %56 : f32
%59 = addf %57, %58 : f32
memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%60 = addi %53, %c1 : index
br ^bb25(%60 : index)
^bb27: // pred: ^bb25
%61 = addi %51, %c1 : index
br ^bb23(%61 : index)
^bb28: // pred: ^bb23
%62 = addi %49, %c1 : index
br ^bb21(%62 : index)
^bb29: // pred: ^bb21
%63 = addi %41, %c4 : index
br ^bb19(%63 : index)
^bb30: // pred: ^bb19
%64 = addi %39, %c4 : index
br ^bb17(%64 : index)
^bb31: // pred: ^bb17
%65 = addi %37, %c4 : index
br ^bb15(%65 : index)
^bb32: // pred: ^bb15
%66 = addi %30, %c32 : index
br ^bb13(%66 : index)
^bb33: // pred: ^bb13
%67 = addi %28, %c32 : index
br ^bb11(%67 : index)
^bb34: // pred: ^bb11
%68 = addi %9, %8 : index
br ^bb3(%68 : index)
^bb35: // pred: ^bb3
%69 = addi %5, %4 : index
br ^bb1(%69 : index)
^bb36: // pred: ^bb1
return
}
// -----// IR Dump After Canonicalizer //----- //
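// NOTE: canonicalizing the CFG form promotes affine operands that are no
// longer loop IVs to symbols (e.g. affine_map<(d0) -> (64, -d0 + 10)>
// becomes affine_map<()[s0] -> (64, -s0 + 10)>) and folds empty
// single-purpose blocks, so ^bb6 now receives its IV directly from the
// cond_br in ^bb5.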
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%1 : index)
^bb1(%3: index): // 2 preds: ^bb0, ^bb10
%4 = cmpi slt, %3, %c10 : index
cond_br %4, ^bb2, ^bb11
^bb2: // pred: ^bb1
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%5 : index)
^bb3(%7: index): // 2 preds: ^bb2, ^bb9
%8 = cmpi slt, %7, %c10 : index
cond_br %8, ^bb4, ^bb10
^bb4: // pred: ^bb3
%9 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%3]
%10 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%7]
%11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%12: index): // 2 preds: ^bb4, ^bb8
%13 = cmpi slt, %12, %9 : index
cond_br %13, ^bb6(%c0 : index), ^bb9
^bb6(%14: index): // 2 preds: ^bb5, ^bb7
%15 = cmpi slt, %14, %10 : index
cond_br %15, ^bb7, ^bb8
^bb7: // pred: ^bb6
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%12, %3]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%14, %7]
%18 = cmpi eq, %16, %17 : index
%19 = select %18, %cst, %cst_0 : f32
memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%20 = addi %14, %c1 : index
br ^bb6(%20 : index)
^bb8: // pred: ^bb6
%21 = addi %12, %c1 : index
br ^bb5(%21 : index)
^bb9: // pred: ^bb5
%22 = addi %7, %6 : index
br ^bb3(%22 : index)
^bb10: // pred: ^bb3
%23 = addi %3, %2 : index
br ^bb1(%23 : index)
^bb11: // pred: ^bb1
return
}
// -----// IR Dump After CSE //----- //
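// NOTE: no redundancy left for CSE to remove; identical to the dump above.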
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%1 : index)
^bb1(%3: index): // 2 preds: ^bb0, ^bb10
%4 = cmpi slt, %3, %c10 : index
cond_br %4, ^bb2, ^bb11
^bb2: // pred: ^bb1
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%5 : index)
^bb3(%7: index): // 2 preds: ^bb2, ^bb9
%8 = cmpi slt, %7, %c10 : index
cond_br %8, ^bb4, ^bb10
^bb4: // pred: ^bb3
%9 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%3]
%10 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%7]
%11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%12: index): // 2 preds: ^bb4, ^bb8
%13 = cmpi slt, %12, %9 : index
cond_br %13, ^bb6(%c0 : index), ^bb9
^bb6(%14: index): // 2 preds: ^bb5, ^bb7
%15 = cmpi slt, %14, %10 : index
cond_br %15, ^bb7, ^bb8
^bb7: // pred: ^bb6
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%12, %3]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%14, %7]
%18 = cmpi eq, %16, %17 : index
%19 = select %18, %cst, %cst_0 : f32
memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%20 = addi %14, %c1 : index
br ^bb6(%20 : index)
^bb8: // pred: ^bb6
%21 = addi %12, %c1 : index
br ^bb5(%21 : index)
^bb9: // pred: ^bb5
%22 = addi %7, %6 : index
br ^bb3(%22 : index)
^bb10: // pred: ^bb3
%23 = addi %3, %2 : index
br ^bb1(%23 : index)
^bb11: // pred: ^bb1
return
}
// -----// IR Dump After Canonicalizer //----- //
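// NOTE: the same symbol promotion and block folding applied to dispatch_2;
// the fill loop's exit now jumps straight into the matmul loop header via
// cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index).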
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%3 : index)
^bb1(%5: index): // 2 preds: ^bb0, ^bb28
%6 = cmpi slt, %5, %c10 : index
cond_br %6, ^bb2, ^bb29
^bb2: // pred: ^bb1
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%7 : index)
^bb3(%9: index): // 2 preds: ^bb2, ^bb27
%10 = cmpi slt, %9, %c10 : index
cond_br %10, ^bb4, ^bb28
^bb4: // pred: ^bb3
%11 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%5]
%12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%13 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%9]
%14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%15 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%5]
%16 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%9]
%17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%18: index): // 2 preds: ^bb4, ^bb8
%19 = cmpi slt, %18, %15 : index
cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index)
^bb6(%20: index): // 2 preds: ^bb5, ^bb7
%21 = cmpi slt, %20, %16 : index
cond_br %21, ^bb7, ^bb8
^bb7: // pred: ^bb6
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%18, %5]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%20, %9]
%24 = cmpi eq, %22, %23 : index
%25 = select %24, %cst, %cst_0 : f32
memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = addi %20, %c1 : index
br ^bb6(%26 : index)
^bb8: // pred: ^bb6
%27 = addi %18, %c1 : index
br ^bb5(%27 : index)
^bb9(%28: index): // 2 preds: ^bb5, ^bb26
%29 = cmpi slt, %28, %11 : index
cond_br %29, ^bb10(%c0 : index), ^bb27
^bb10(%30: index): // 2 preds: ^bb9, ^bb25
%31 = cmpi slt, %30, %13 : index
cond_br %31, ^bb11, ^bb26
^bb11: // pred: ^bb10
%32 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%11, %28]
%33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%34 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%13, %30]
%35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb12(%c0 : index)
^bb12(%37: index): // 2 preds: ^bb11, ^bb24
%38 = cmpi slt, %37, %32 : index
cond_br %38, ^bb13(%c0 : index), ^bb25
^bb13(%39: index): // 2 preds: ^bb12, ^bb23
%40 = cmpi slt, %39, %34 : index
cond_br %40, ^bb14(%c0 : index), ^bb24
^bb14(%41: index): // 2 preds: ^bb13, ^bb22
%42 = cmpi slt, %41, %c10 : index
cond_br %42, ^bb15, ^bb23
^bb15: // pred: ^bb14
%43 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%32, %37]
%44 = affine.min affine_map<()[s0] -> (4, -s0 + 10)>()[%41]
%45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%46 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%34, %39]
%47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb16(%c0 : index)
^bb16(%49: index): // 2 preds: ^bb15, ^bb21
%50 = cmpi slt, %49, %43 : index
cond_br %50, ^bb17(%c0 : index), ^bb22
^bb17(%51: index): // 2 preds: ^bb16, ^bb20
%52 = cmpi slt, %51, %46 : index
cond_br %52, ^bb18(%c0 : index), ^bb21
^bb18(%53: index): // 2 preds: ^bb17, ^bb19
%54 = cmpi slt, %53, %44 : index
cond_br %54, ^bb19, ^bb20
^bb19: // pred: ^bb18
%55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%58 = mulf %55, %56 : f32
%59 = addf %57, %58 : f32
memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%60 = addi %53, %c1 : index
br ^bb18(%60 : index)
^bb20: // pred: ^bb18
%61 = addi %51, %c1 : index
br ^bb17(%61 : index)
^bb21: // pred: ^bb17
%62 = addi %49, %c1 : index
br ^bb16(%62 : index)
^bb22: // pred: ^bb16
%63 = addi %41, %c4 : index
br ^bb14(%63 : index)
^bb23: // pred: ^bb14
%64 = addi %39, %c4 : index
br ^bb13(%64 : index)
^bb24: // pred: ^bb13
%65 = addi %37, %c4 : index
br ^bb12(%65 : index)
^bb25: // pred: ^bb12
%66 = addi %30, %c32 : index
br ^bb10(%66 : index)
^bb26: // pred: ^bb10
%67 = addi %28, %c32 : index
br ^bb9(%67 : index)
^bb27: // pred: ^bb9
%68 = addi %9, %8 : index
br ^bb3(%68 : index)
^bb28: // pred: ^bb3
%69 = addi %5, %4 : index
br ^bb1(%69 : index)
^bb29: // pred: ^bb1
return
}
// -----// IR Dump After TensorConstantBufferize //----- //
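// NOTE: TensorConstantBufferize turns tensor-typed constants into buffers;
// this dispatch has none, so the body is untouched. The pass runs on the
// module, which is why the dump now includes the surrounding module and the
// hal.interface declaration.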
module {
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%1 : index)
^bb1(%3: index): // 2 preds: ^bb0, ^bb10
%4 = cmpi slt, %3, %c10 : index
cond_br %4, ^bb2, ^bb11
^bb2: // pred: ^bb1
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%5 : index)
^bb3(%7: index): // 2 preds: ^bb2, ^bb9
%8 = cmpi slt, %7, %c10 : index
cond_br %8, ^bb4, ^bb10
^bb4: // pred: ^bb3
%9 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%3]
%10 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%7]
%11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%12: index): // 2 preds: ^bb4, ^bb8
%13 = cmpi slt, %12, %9 : index
cond_br %13, ^bb6(%c0 : index), ^bb9
^bb6(%14: index): // 2 preds: ^bb5, ^bb7
%15 = cmpi slt, %14, %10 : index
cond_br %15, ^bb7, ^bb8
^bb7: // pred: ^bb6
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%12, %3]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%14, %7]
%18 = cmpi eq, %16, %17 : index
%19 = select %18, %cst, %cst_0 : f32
memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%20 = addi %14, %c1 : index
br ^bb6(%20 : index)
^bb8: // pred: ^bb6
%21 = addi %12, %c1 : index
br ^bb5(%21 : index)
^bb9: // pred: ^bb5
%22 = addi %7, %6 : index
br ^bb3(%22 : index)
^bb10: // pred: ^bb3
%23 = addi %3, %2 : index
br ^bb1(%23 : index)
^bb11: // pred: ^bb1
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
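// NOTE: another no-op CSE run, this time on the canonicalized CFG form of
// dispatch_2.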
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%3 : index)
^bb1(%5: index): // 2 preds: ^bb0, ^bb28
%6 = cmpi slt, %5, %c10 : index
cond_br %6, ^bb2, ^bb29
^bb2: // pred: ^bb1
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%7 : index)
^bb3(%9: index): // 2 preds: ^bb2, ^bb27
%10 = cmpi slt, %9, %c10 : index
cond_br %10, ^bb4, ^bb28
^bb4: // pred: ^bb3
%11 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%5]
%12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%13 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%9]
%14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%15 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%5]
%16 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%9]
%17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%18: index): // 2 preds: ^bb4, ^bb8
%19 = cmpi slt, %18, %15 : index
cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index)
^bb6(%20: index): // 2 preds: ^bb5, ^bb7
%21 = cmpi slt, %20, %16 : index
cond_br %21, ^bb7, ^bb8
^bb7: // pred: ^bb6
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%18, %5]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%20, %9]
%24 = cmpi eq, %22, %23 : index
%25 = select %24, %cst, %cst_0 : f32
memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = addi %20, %c1 : index
br ^bb6(%26 : index)
^bb8: // pred: ^bb6
%27 = addi %18, %c1 : index
br ^bb5(%27 : index)
^bb9(%28: index): // 2 preds: ^bb5, ^bb26
%29 = cmpi slt, %28, %11 : index
cond_br %29, ^bb10(%c0 : index), ^bb27
^bb10(%30: index): // 2 preds: ^bb9, ^bb25
%31 = cmpi slt, %30, %13 : index
cond_br %31, ^bb11, ^bb26
^bb11: // pred: ^bb10
%32 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%11, %28]
%33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%34 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%13, %30]
%35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb12(%c0 : index)
^bb12(%37: index): // 2 preds: ^bb11, ^bb24
%38 = cmpi slt, %37, %32 : index
cond_br %38, ^bb13(%c0 : index), ^bb25
^bb13(%39: index): // 2 preds: ^bb12, ^bb23
%40 = cmpi slt, %39, %34 : index
cond_br %40, ^bb14(%c0 : index), ^bb24
^bb14(%41: index): // 2 preds: ^bb13, ^bb22
%42 = cmpi slt, %41, %c10 : index
cond_br %42, ^bb15, ^bb23
^bb15: // pred: ^bb14
%43 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%32, %37]
%44 = affine.min affine_map<()[s0] -> (4, -s0 + 10)>()[%41]
%45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%46 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%34, %39]
%47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb16(%c0 : index)
^bb16(%49: index): // 2 preds: ^bb15, ^bb21
%50 = cmpi slt, %49, %43 : index
cond_br %50, ^bb17(%c0 : index), ^bb22
^bb17(%51: index): // 2 preds: ^bb16, ^bb20
%52 = cmpi slt, %51, %46 : index
cond_br %52, ^bb18(%c0 : index), ^bb21
^bb18(%53: index): // 2 preds: ^bb17, ^bb19
%54 = cmpi slt, %53, %44 : index
cond_br %54, ^bb19, ^bb20
^bb19: // pred: ^bb18
%55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%58 = mulf %55, %56 : f32
%59 = addf %57, %58 : f32
memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%60 = addi %53, %c1 : index
br ^bb18(%60 : index)
^bb20: // pred: ^bb18
%61 = addi %51, %c1 : index
br ^bb17(%61 : index)
^bb21: // pred: ^bb17
%62 = addi %49, %c1 : index
br ^bb16(%62 : index)
^bb22: // pred: ^bb16
%63 = addi %41, %c4 : index
br ^bb14(%63 : index)
^bb23: // pred: ^bb14
%64 = addi %39, %c4 : index
br ^bb13(%64 : index)
^bb24: // pred: ^bb13
%65 = addi %37, %c4 : index
br ^bb12(%65 : index)
^bb25: // pred: ^bb12
%66 = addi %30, %c32 : index
br ^bb10(%66 : index)
^bb26: // pred: ^bb10
%67 = addi %28, %c32 : index
br ^bb9(%67 : index)
^bb27: // pred: ^bb9
%68 = addi %9, %8 : index
br ^bb3(%68 : index)
^bb28: // pred: ^bb3
%69 = addi %5, %4 : index
br ^bb1(%69 : index)
^bb29: // pred: ^bb1
return
}
// -----// IR Dump After FoldTensorExtractOp //----- //
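// NOTE: FoldTensorExtractOp appears to rewrite tensor.extract ops left over
// from bufferization into direct memref.load ops; with no tensor.extract
// present here it changes nothing.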
module {
func @matmul_test_dispatch_0() {
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%1 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%1 : index)
^bb1(%3: index): // 2 preds: ^bb0, ^bb10
%4 = cmpi slt, %3, %c10 : index
cond_br %4, ^bb2, ^bb11
^bb2: // pred: ^bb1
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%5 : index)
^bb3(%7: index): // 2 preds: ^bb2, ^bb9
%8 = cmpi slt, %7, %c10 : index
cond_br %8, ^bb4, ^bb10
^bb4: // pred: ^bb3
%9 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%3]
%10 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%7]
%11 = memref.subview %0[%3, %7] [%9, %10] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%12: index): // 2 preds: ^bb4, ^bb8
%13 = cmpi slt, %12, %9 : index
cond_br %13, ^bb6(%c0 : index), ^bb9
^bb6(%14: index): // 2 preds: ^bb5, ^bb7
%15 = cmpi slt, %14, %10 : index
cond_br %15, ^bb7, ^bb8
^bb7: // pred: ^bb6
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%12, %3]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%14, %7]
%18 = cmpi eq, %16, %17 : index
%19 = select %18, %cst, %cst_0 : f32
memref.store %19, %11[%12, %14] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%20 = addi %14, %c1 : index
br ^bb6(%20 : index)
^bb8: // pred: ^bb6
%21 = addi %12, %c1 : index
br ^bb5(%21 : index)
^bb9: // pred: ^bb5
%22 = addi %7, %6 : index
br ^bb3(%22 : index)
^bb10: // pred: ^bb3
%23 = addi %3, %2 : index
br ^bb1(%23 : index)
^bb11: // pred: ^bb1
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After TensorConstantBufferize //----- //
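// NOTE: no tensor constants in dispatch_2 either; the body is unchanged and
// the module-scope dump lists the interface: two read-only input bindings
// and one write/discard output binding.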
module {
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%3 : index)
^bb1(%5: index): // 2 preds: ^bb0, ^bb28
%6 = cmpi slt, %5, %c10 : index
cond_br %6, ^bb2, ^bb29
^bb2: // pred: ^bb1
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%7 : index)
^bb3(%9: index): // 2 preds: ^bb2, ^bb27
%10 = cmpi slt, %9, %c10 : index
cond_br %10, ^bb4, ^bb28
^bb4: // pred: ^bb3
%11 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%5]
%12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%13 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%9]
%14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%15 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%5]
%16 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%9]
%17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%18: index): // 2 preds: ^bb4, ^bb8
%19 = cmpi slt, %18, %15 : index
cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index)
^bb6(%20: index): // 2 preds: ^bb5, ^bb7
%21 = cmpi slt, %20, %16 : index
cond_br %21, ^bb7, ^bb8
^bb7: // pred: ^bb6
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%18, %5]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%20, %9]
%24 = cmpi eq, %22, %23 : index
%25 = select %24, %cst, %cst_0 : f32
memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = addi %20, %c1 : index
br ^bb6(%26 : index)
^bb8: // pred: ^bb6
%27 = addi %18, %c1 : index
br ^bb5(%27 : index)
^bb9(%28: index): // 2 preds: ^bb5, ^bb26
%29 = cmpi slt, %28, %11 : index
cond_br %29, ^bb10(%c0 : index), ^bb27
^bb10(%30: index): // 2 preds: ^bb9, ^bb25
%31 = cmpi slt, %30, %13 : index
cond_br %31, ^bb11, ^bb26
^bb11: // pred: ^bb10
%32 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%11, %28]
%33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%34 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%13, %30]
%35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb12(%c0 : index)
^bb12(%37: index): // 2 preds: ^bb11, ^bb24
%38 = cmpi slt, %37, %32 : index
cond_br %38, ^bb13(%c0 : index), ^bb25
^bb13(%39: index): // 2 preds: ^bb12, ^bb23
%40 = cmpi slt, %39, %34 : index
cond_br %40, ^bb14(%c0 : index), ^bb24
^bb14(%41: index): // 2 preds: ^bb13, ^bb22
%42 = cmpi slt, %41, %c10 : index
cond_br %42, ^bb15, ^bb23
^bb15: // pred: ^bb14
%43 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%32, %37]
%44 = affine.min affine_map<()[s0] -> (4, -s0 + 10)>()[%41]
%45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%46 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%34, %39]
%47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb16(%c0 : index)
^bb16(%49: index): // 2 preds: ^bb15, ^bb21
%50 = cmpi slt, %49, %43 : index
cond_br %50, ^bb17(%c0 : index), ^bb22
^bb17(%51: index): // 2 preds: ^bb16, ^bb20
%52 = cmpi slt, %51, %46 : index
cond_br %52, ^bb18(%c0 : index), ^bb21
^bb18(%53: index): // 2 preds: ^bb17, ^bb19
%54 = cmpi slt, %53, %44 : index
cond_br %54, ^bb19, ^bb20
^bb19: // pred: ^bb18
%55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%58 = mulf %55, %56 : f32
%59 = addf %57, %58 : f32
memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%60 = addi %53, %c1 : index
br ^bb18(%60 : index)
^bb20: // pred: ^bb18
%61 = addi %51, %c1 : index
br ^bb17(%61 : index)
^bb21: // pred: ^bb17
%62 = addi %49, %c1 : index
br ^bb16(%62 : index)
^bb22: // pred: ^bb16
%63 = addi %41, %c4 : index
br ^bb14(%63 : index)
^bb23: // pred: ^bb14
%64 = addi %39, %c4 : index
br ^bb13(%64 : index)
^bb24: // pred: ^bb13
%65 = addi %37, %c4 : index
br ^bb12(%65 : index)
^bb25: // pred: ^bb12
%66 = addi %30, %c32 : index
br ^bb10(%66 : index)
^bb26: // pred: ^bb10
%67 = addi %28, %c32 : index
br ^bb9(%67 : index)
^bb27: // pred: ^bb9
%68 = addi %9, %8 : index
br ^bb3(%68 : index)
^bb28: // pred: ^bb3
%69 = addi %5, %4 : index
br ^bb1(%69 : index)
^bb29: // pred: ^bb1
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After FoldTensorExtractOp //----- //
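// NOTE: a no-op for dispatch_2 as well; this is the last dump in the log
// before lowering to the LLVM dialect.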
module {
func @matmul_test_dispatch_2() {
%c4 = constant 4 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c10 = constant 10 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant 1.000000e+00 : f32
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<10x10xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<10x10xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<10x10xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
br ^bb1(%3 : index)
^bb1(%5: index): // 2 preds: ^bb0, ^bb28
%6 = cmpi slt, %5, %c10 : index
cond_br %6, ^bb2, ^bb29
^bb2: // pred: ^bb1
%7 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
br ^bb3(%7 : index)
^bb3(%9: index): // 2 preds: ^bb2, ^bb27
%10 = cmpi slt, %9, %c10 : index
cond_br %10, ^bb4, ^bb28
^bb4: // pred: ^bb3
%11 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%5]
%12 = memref.subview %0[%5, 0] [%11, 10] [1, 1] : memref<10x10xf32> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%13 = affine.min affine_map<()[s0] -> (64, -s0 + 10)>()[%9]
%14 = memref.subview %1[0, %9] [10, %13] [1, 1] : memref<10x10xf32> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%15 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%5]
%16 = affine.min affine_map<()[s0] -> (-s0 + 10, 64)>()[%9]
%17 = memref.subview %2[%5, %9] [%11, %13] [1, 1] : memref<10x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb5(%c0 : index)
^bb5(%18: index): // 2 preds: ^bb4, ^bb8
%19 = cmpi slt, %18, %15 : index
cond_br %19, ^bb6(%c0 : index), ^bb9(%c0 : index)
^bb6(%20: index): // 2 preds: ^bb5, ^bb7
%21 = cmpi slt, %20, %16 : index
cond_br %21, ^bb7, ^bb8
^bb7: // pred: ^bb6
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%18, %5]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%20, %9]
%24 = cmpi eq, %22, %23 : index
%25 = select %24, %cst, %cst_0 : f32
memref.store %25, %17[%18, %20] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%26 = addi %20, %c1 : index
br ^bb6(%26 : index)
^bb8: // pred: ^bb6
%27 = addi %18, %c1 : index
br ^bb5(%27 : index)
^bb9(%28: index): // 2 preds: ^bb5, ^bb26
%29 = cmpi slt, %28, %11 : index
cond_br %29, ^bb10(%c0 : index), ^bb27
^bb10(%30: index): // 2 preds: ^bb9, ^bb25
%31 = cmpi slt, %30, %13 : index
cond_br %31, ^bb11, ^bb26
^bb11: // pred: ^bb10
%32 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%11, %28]
%33 = memref.subview %12[%28, 0] [%32, 10] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%34 = affine.min affine_map<()[s0, s1] -> (32, s0 - s1)>()[%13, %30]
%35 = memref.subview %14[0, %30] [10, %34] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%36 = memref.subview %17[%28, %30] [%32, %34] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb12(%c0 : index)
^bb12(%37: index): // 2 preds: ^bb11, ^bb24
%38 = cmpi slt, %37, %32 : index
cond_br %38, ^bb13(%c0 : index), ^bb25
^bb13(%39: index): // 2 preds: ^bb12, ^bb23
%40 = cmpi slt, %39, %34 : index
cond_br %40, ^bb14(%c0 : index), ^bb24
^bb14(%41: index): // 2 preds: ^bb13, ^bb22
%42 = cmpi slt, %41, %c10 : index
cond_br %42, ^bb15, ^bb23
^bb15: // pred: ^bb14
%43 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%32, %37]
%44 = affine.min affine_map<()[s0] -> (4, -s0 + 10)>()[%41]
%45 = memref.subview %33[%37, %41] [%43, %44] [1, 1] : memref<?x10xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%46 = affine.min affine_map<()[s0, s1] -> (4, s0 - s1)>()[%34, %39]
%47 = memref.subview %35[%41, %39] [%44, %46] [1, 1] : memref<10x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%48 = memref.subview %36[%37, %39] [%43, %46] [1, 1] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
br ^bb16(%c0 : index)
^bb16(%49: index): // 2 preds: ^bb15, ^bb21
%50 = cmpi slt, %49, %43 : index
cond_br %50, ^bb17(%c0 : index), ^bb22
^bb17(%51: index): // 2 preds: ^bb16, ^bb20
%52 = cmpi slt, %51, %46 : index
cond_br %52, ^bb18(%c0 : index), ^bb21
^bb18(%53: index): // 2 preds: ^bb17, ^bb19
%54 = cmpi slt, %53, %44 : index
cond_br %54, ^bb19, ^bb20
^bb19: // pred: ^bb18
%55 = memref.load %45[%49, %53] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%56 = memref.load %47[%53, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%57 = memref.load %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%58 = mulf %55, %56 : f32
%59 = addf %57, %58 : f32
memref.store %59, %48[%49, %51] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
%60 = addi %53, %c1 : index
br ^bb18(%60 : index)
^bb20: // pred: ^bb18
%61 = addi %51, %c1 : index
br ^bb17(%61 : index)
^bb21: // pred: ^bb17
%62 = addi %49, %c1 : index
br ^bb16(%62 : index)
^bb22: // pred: ^bb16
%63 = addi %41, %c4 : index
br ^bb14(%63 : index)
^bb23: // pred: ^bb14
%64 = addi %39, %c4 : index
br ^bb13(%64 : index)
^bb24: // pred: ^bb13
%65 = addi %37, %c4 : index
br ^bb12(%65 : index)
^bb25: // pred: ^bb12
%66 = addi %30, %c32 : index
br ^bb10(%66 : index)
^bb26: // pred: ^bb10
%67 = addi %28, %c32 : index
br ^bb9(%67 : index)
^bb27: // pred: ^bb9
%68 = addi %9, %8 : index
br ^bb3(%68 : index)
^bb28: // pred: ^bb3
%69 = addi %5, %4 : index
br ^bb1(%69 : index)
^bb29: // pred: ^bb1
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After ConvertToLLVM //----- //
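// NOTE: ConvertToLLVM lowers the dispatch against IREE's CPU executable ABI:
// %arg0 points at iree_hal_executable_dispatch_state_v0_t, binding base
// pointers are loaded from its binding array (field 5), each memref becomes
// the usual (allocated ptr, aligned ptr, offset, sizes[2], strides[2])
// descriptor struct, and the hal.interface.workgroup ops become loads of the
// workgroup id from %arg1 and the workgroup count from the state struct.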
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.mlir.constant(10 : index) : i64
%2 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%3 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%4 = llvm.mlir.constant(1 : index) : i64
%5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.mlir.constant(0 : i64) : i64
%8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%9 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32>
%12 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%13 = llvm.insertvalue %11, %12[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%14 = llvm.insertvalue %11, %13[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%15 = llvm.mlir.constant(0 : index) : i64
%16 = llvm.insertvalue %15, %14[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%17 = llvm.mlir.constant(10 : index) : i64
%18 = llvm.insertvalue %17, %16[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%19 = llvm.mlir.constant(10 : index) : i64
%20 = llvm.insertvalue %19, %18[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%21 = llvm.mlir.constant(10 : index) : i64
%22 = llvm.insertvalue %21, %20[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%23 = llvm.mlir.constant(1 : index) : i64
%24 = llvm.insertvalue %23, %22[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%25 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%26 = llvm.extractvalue %25[0] : !llvm.array<3 x i32>
%27 = llvm.zext %26 : i32 to i64
%28 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.extractvalue %29[0] : !llvm.array<3 x i32>
%31 = llvm.zext %30 : i32 to i64
%32 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%33 = llvm.extractvalue %32[1] : !llvm.array<3 x i32>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%36 = llvm.extractvalue %35[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%37 = llvm.extractvalue %36[1] : !llvm.array<3 x i32>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.mlir.constant(64 : index) : i64
%40 = llvm.mul %34, %39 : i64
%41 = llvm.mlir.constant(64 : index) : i64
%42 = llvm.mul %38, %41 : i64
llvm.br ^bb1(%40 : i64)
^bb1(%43: i64): // 2 preds: ^bb0, ^bb10
%44 = llvm.icmp "slt" %43, %1 : i64
llvm.cond_br %44, ^bb2, ^bb11
^bb2: // pred: ^bb1
%45 = llvm.mlir.constant(64 : index) : i64
%46 = llvm.mul %27, %45 : i64
%47 = llvm.mlir.constant(64 : index) : i64
%48 = llvm.mul %31, %47 : i64
llvm.br ^bb3(%46 : i64)
^bb3(%49: i64): // 2 preds: ^bb2, ^bb9
%50 = llvm.icmp "slt" %49, %1 : i64
llvm.cond_br %50, ^bb4, ^bb10
^bb4: // pred: ^bb3
%51 = llvm.mlir.constant(64 : index) : i64
%52 = llvm.mlir.constant(-1 : index) : i64
%53 = llvm.mul %43, %52 : i64
%54 = llvm.mlir.constant(10 : index) : i64
%55 = llvm.add %53, %54 : i64
%56 = llvm.icmp "slt" %51, %55 : i64
%57 = llvm.select %56, %51, %55 : i1, i64
%58 = llvm.mlir.constant(64 : index) : i64
%59 = llvm.mlir.constant(-1 : index) : i64
%60 = llvm.mul %49, %59 : i64
%61 = llvm.mlir.constant(10 : index) : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.icmp "slt" %58, %62 : i64
%64 = llvm.select %63, %58, %62 : i1, i64
%65 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%66 = llvm.extractvalue %24[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%67 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32>
%68 = llvm.insertvalue %67, %65[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%69 = llvm.extractvalue %24[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%70 = llvm.bitcast %69 : !llvm.ptr<f32> to !llvm.ptr<f32>
%71 = llvm.insertvalue %70, %68[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%72 = llvm.extractvalue %24[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%73 = llvm.extractvalue %24[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%74 = llvm.extractvalue %24[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%75 = llvm.mul %43, %72 : i64
%76 = llvm.add %74, %75 : i64
%77 = llvm.mul %49, %73 : i64
%78 = llvm.add %76, %77 : i64
%79 = llvm.insertvalue %78, %71[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%80 = llvm.mlir.constant(1 : i64) : i64
%81 = llvm.insertvalue %64, %79[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%82 = llvm.insertvalue %80, %81[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%83 = llvm.mlir.constant(10 : i64) : i64
%84 = llvm.insertvalue %57, %82[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%85 = llvm.insertvalue %83, %84[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
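  // ^bb4 materializes the subview descriptor for the current tile: the new
  // offset %78 is base_offset + i * stride0 + j * stride1, and the tile
  // sizes %57 / %64 are clamped to min(64, 10 - i) / min(64, 10 - j) so
  // partial tiles at the boundary stay in bounds.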
llvm.br ^bb5(%0 : i64)
^bb5(%86: i64): // 2 preds: ^bb4, ^bb8
%87 = llvm.icmp "slt" %86, %57 : i64
llvm.cond_br %87, ^bb6(%0 : i64), ^bb9
^bb6(%88: i64): // 2 preds: ^bb5, ^bb7
%89 = llvm.icmp "slt" %88, %64 : i64
llvm.cond_br %89, ^bb7, ^bb8
^bb7: // pred: ^bb6
%90 = llvm.add %86, %43 : i64
%91 = llvm.add %88, %49 : i64
%92 = llvm.icmp "eq" %90, %91 : i64
%93 = llvm.select %92, %2, %3 : i1, f32
%94 = llvm.extractvalue %85[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%95 = llvm.extractvalue %85[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%96 = llvm.mlir.constant(10 : index) : i64
%97 = llvm.mul %86, %96 : i64
%98 = llvm.add %95, %97 : i64
%99 = llvm.add %98, %88 : i64
%100 = llvm.getelementptr %94[%99] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %93, %100 : !llvm.ptr<f32>
%101 = llvm.add %88, %4 : i64
llvm.br ^bb6(%101 : i64)
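  // The ^bb7 body above stores select(i == j, 0.0, 1.0) through the subview
  // at aligned_ptr[offset + i * 10 + j]; as a sketch of the same
  // computation:
  //
  //   out[i * 10 + j] = (i == j) ? 0.0f : 1.0f;
  //
  // i.e. this dispatch fills the 10x10 test matrix with zeros on the
  // diagonal and ones elsewhere.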
^bb8: // pred: ^bb6
%102 = llvm.add %86, %4 : i64
llvm.br ^bb5(%102 : i64)
^bb9: // pred: ^bb5
%103 = llvm.add %49, %48 : i64
llvm.br ^bb3(%103 : i64)
^bb10: // pred: ^bb3
%104 = llvm.add %43, %42 : i64
llvm.br ^bb1(%104 : i64)
^bb11: // pred: ^bb1
%105 = llvm.mlir.constant(0 : i32) : i32
llvm.return %105 : i32
}
}
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility //----- //
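// As the pass name suggests, this only synchronizes symbol visibility: the
// dump below differs from the previous one solely in the added
// sym_visibility = "private" attribute on the function; the body is
// unchanged.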
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.mlir.constant(10 : index) : i64
%2 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%3 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%4 = llvm.mlir.constant(1 : index) : i64
%5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.mlir.constant(0 : i64) : i64
%8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%9 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32>
%12 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%13 = llvm.insertvalue %11, %12[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%14 = llvm.insertvalue %11, %13[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%15 = llvm.mlir.constant(0 : index) : i64
%16 = llvm.insertvalue %15, %14[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%17 = llvm.mlir.constant(10 : index) : i64
%18 = llvm.insertvalue %17, %16[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%19 = llvm.mlir.constant(10 : index) : i64
%20 = llvm.insertvalue %19, %18[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%21 = llvm.mlir.constant(10 : index) : i64
%22 = llvm.insertvalue %21, %20[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%23 = llvm.mlir.constant(1 : index) : i64
%24 = llvm.insertvalue %23, %22[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%25 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%26 = llvm.extractvalue %25[0] : !llvm.array<3 x i32>
%27 = llvm.zext %26 : i32 to i64
%28 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%29 = llvm.extractvalue %28[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%30 = llvm.extractvalue %29[0] : !llvm.array<3 x i32>
%31 = llvm.zext %30 : i32 to i64
%32 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%33 = llvm.extractvalue %32[1] : !llvm.array<3 x i32>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%36 = llvm.extractvalue %35[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%37 = llvm.extractvalue %36[1] : !llvm.array<3 x i32>
%38 = llvm.zext %37 : i32 to i64
%39 = llvm.mlir.constant(64 : index) : i64
%40 = llvm.mul %34, %39 : i64
%41 = llvm.mlir.constant(64 : index) : i64
%42 = llvm.mul %38, %41 : i64
llvm.br ^bb1(%40 : i64)
^bb1(%43: i64): // 2 preds: ^bb0, ^bb10
%44 = llvm.icmp "slt" %43, %1 : i64
llvm.cond_br %44, ^bb2, ^bb11
^bb2: // pred: ^bb1
%45 = llvm.mlir.constant(64 : index) : i64
%46 = llvm.mul %27, %45 : i64
%47 = llvm.mlir.constant(64 : index) : i64
%48 = llvm.mul %31, %47 : i64
llvm.br ^bb3(%46 : i64)
^bb3(%49: i64): // 2 preds: ^bb2, ^bb9
%50 = llvm.icmp "slt" %49, %1 : i64
llvm.cond_br %50, ^bb4, ^bb10
^bb4: // pred: ^bb3
%51 = llvm.mlir.constant(64 : index) : i64
%52 = llvm.mlir.constant(-1 : index) : i64
%53 = llvm.mul %43, %52 : i64
%54 = llvm.mlir.constant(10 : index) : i64
%55 = llvm.add %53, %54 : i64
%56 = llvm.icmp "slt" %51, %55 : i64
%57 = llvm.select %56, %51, %55 : i1, i64
%58 = llvm.mlir.constant(64 : index) : i64
%59 = llvm.mlir.constant(-1 : index) : i64
%60 = llvm.mul %49, %59 : i64
%61 = llvm.mlir.constant(10 : index) : i64
%62 = llvm.add %60, %61 : i64
%63 = llvm.icmp "slt" %58, %62 : i64
%64 = llvm.select %63, %58, %62 : i1, i64
%65 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%66 = llvm.extractvalue %24[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%67 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32>
%68 = llvm.insertvalue %67, %65[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%69 = llvm.extractvalue %24[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%70 = llvm.bitcast %69 : !llvm.ptr<f32> to !llvm.ptr<f32>
%71 = llvm.insertvalue %70, %68[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%72 = llvm.extractvalue %24[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%73 = llvm.extractvalue %24[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%74 = llvm.extractvalue %24[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%75 = llvm.mul %43, %72 : i64
%76 = llvm.add %74, %75 : i64
%77 = llvm.mul %49, %73 : i64
%78 = llvm.add %76, %77 : i64
%79 = llvm.insertvalue %78, %71[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%80 = llvm.mlir.constant(1 : i64) : i64
%81 = llvm.insertvalue %64, %79[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%82 = llvm.insertvalue %80, %81[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%83 = llvm.mlir.constant(10 : i64) : i64
%84 = llvm.insertvalue %57, %82[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%85 = llvm.insertvalue %83, %84[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb5(%0 : i64)
^bb5(%86: i64): // 2 preds: ^bb4, ^bb8
%87 = llvm.icmp "slt" %86, %57 : i64
llvm.cond_br %87, ^bb6(%0 : i64), ^bb9
^bb6(%88: i64): // 2 preds: ^bb5, ^bb7
%89 = llvm.icmp "slt" %88, %64 : i64
llvm.cond_br %89, ^bb7, ^bb8
^bb7: // pred: ^bb6
%90 = llvm.add %86, %43 : i64
%91 = llvm.add %88, %49 : i64
%92 = llvm.icmp "eq" %90, %91 : i64
%93 = llvm.select %92, %2, %3 : i1, f32
%94 = llvm.extractvalue %85[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%95 = llvm.extractvalue %85[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%96 = llvm.mlir.constant(10 : index) : i64
%97 = llvm.mul %86, %96 : i64
%98 = llvm.add %95, %97 : i64
%99 = llvm.add %98, %88 : i64
%100 = llvm.getelementptr %94[%99] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %93, %100 : !llvm.ptr<f32>
%101 = llvm.add %88, %4 : i64
llvm.br ^bb6(%101 : i64)
^bb8: // pred: ^bb6
%102 = llvm.add %86, %4 : i64
llvm.br ^bb5(%102 : i64)
^bb9: // pred: ^bb5
%103 = llvm.add %49, %48 : i64
llvm.br ^bb3(%103 : i64)
^bb10: // pred: ^bb3
%104 = llvm.add %43, %42 : i64
llvm.br ^bb1(%104 : i64)
^bb11: // pred: ^bb1
%105 = llvm.mlir.constant(0 : i32) : i32
llvm.return %105 : i32
}
}
// -----// IR Dump After ConvertToLLVM //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 {
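  // This is a different dispatch: unlike dispatch_0 above it reads three
  // bindings (base pointers loaded at indices 0, 1 and 2 of the binding
  // array in field [5]), and further down it both writes the init pattern
  // into its output tile and accumulates a product on top of it, so it
  // corresponds to the fill fused with the matmul. The loop nest tiles the
  // 10x10 matmul three times: 64x64 tiles across workgroups, then 32x32
  // tiles, then 4x4x4 micro-tiles executed as scalar loops.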
%0 = llvm.mlir.constant(4 : index) : i64
%1 = llvm.mlir.constant(32 : index) : i64
%2 = llvm.mlir.constant(0 : index) : i64
%3 = llvm.mlir.constant(10 : index) : i64
%4 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%5 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%11 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32>
%14 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%15 = llvm.insertvalue %13, %14[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%16 = llvm.insertvalue %13, %15[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%17 = llvm.mlir.constant(0 : index) : i64
%18 = llvm.insertvalue %17, %16[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%19 = llvm.mlir.constant(10 : index) : i64
%20 = llvm.insertvalue %19, %18[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%21 = llvm.mlir.constant(10 : index) : i64
%22 = llvm.insertvalue %21, %20[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%23 = llvm.mlir.constant(10 : index) : i64
%24 = llvm.insertvalue %23, %22[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%25 = llvm.mlir.constant(1 : index) : i64
%26 = llvm.insertvalue %25, %24[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%27 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%28 = llvm.extractvalue %27[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%29 = llvm.mlir.constant(1 : i64) : i64
%30 = llvm.getelementptr %28[%29] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.getelementptr %31[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%33 = llvm.bitcast %32 : !llvm.ptr<i8> to !llvm.ptr<f32>
%34 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%35 = llvm.insertvalue %33, %34[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%36 = llvm.insertvalue %33, %35[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%37 = llvm.mlir.constant(0 : index) : i64
%38 = llvm.insertvalue %37, %36[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%39 = llvm.mlir.constant(10 : index) : i64
%40 = llvm.insertvalue %39, %38[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%41 = llvm.mlir.constant(10 : index) : i64
%42 = llvm.insertvalue %41, %40[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%43 = llvm.mlir.constant(10 : index) : i64
%44 = llvm.insertvalue %43, %42[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%45 = llvm.mlir.constant(1 : index) : i64
%46 = llvm.insertvalue %45, %44[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%47 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%48 = llvm.extractvalue %47[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%49 = llvm.mlir.constant(2 : i64) : i64
%50 = llvm.getelementptr %48[%49] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%51 = llvm.load %50 : !llvm.ptr<ptr<i8>>
%52 = llvm.getelementptr %51[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%53 = llvm.bitcast %52 : !llvm.ptr<i8> to !llvm.ptr<f32>
%54 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%55 = llvm.insertvalue %53, %54[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%56 = llvm.insertvalue %53, %55[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%57 = llvm.mlir.constant(0 : index) : i64
%58 = llvm.insertvalue %57, %56[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%59 = llvm.mlir.constant(10 : index) : i64
%60 = llvm.insertvalue %59, %58[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%61 = llvm.mlir.constant(10 : index) : i64
%62 = llvm.insertvalue %61, %60[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%63 = llvm.mlir.constant(10 : index) : i64
%64 = llvm.insertvalue %63, %62[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%65 = llvm.mlir.constant(1 : index) : i64
%66 = llvm.insertvalue %65, %64[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%67 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%68 = llvm.extractvalue %67[0] : !llvm.array<3 x i32>
%69 = llvm.zext %68 : i32 to i64
%70 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%71 = llvm.extractvalue %70[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%72 = llvm.extractvalue %71[0] : !llvm.array<3 x i32>
%73 = llvm.zext %72 : i32 to i64
%74 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%75 = llvm.extractvalue %74[1] : !llvm.array<3 x i32>
%76 = llvm.zext %75 : i32 to i64
%77 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%78 = llvm.extractvalue %77[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%79 = llvm.extractvalue %78[1] : !llvm.array<3 x i32>
%80 = llvm.zext %79 : i32 to i64
%81 = llvm.mlir.constant(64 : index) : i64
%82 = llvm.mul %76, %81 : i64
%83 = llvm.mlir.constant(64 : index) : i64
%84 = llvm.mul %80, %83 : i64
llvm.br ^bb1(%82 : i64)
^bb1(%85: i64): // 2 preds: ^bb0, ^bb28
%86 = llvm.icmp "slt" %85, %3 : i64
llvm.cond_br %86, ^bb2, ^bb29
^bb2: // pred: ^bb1
%87 = llvm.mlir.constant(64 : index) : i64
%88 = llvm.mul %69, %87 : i64
%89 = llvm.mlir.constant(64 : index) : i64
%90 = llvm.mul %73, %89 : i64
llvm.br ^bb3(%88 : i64)
^bb3(%91: i64): // 2 preds: ^bb2, ^bb27
%92 = llvm.icmp "slt" %91, %3 : i64
llvm.cond_br %92, ^bb4, ^bb28
^bb4: // pred: ^bb3
%93 = llvm.mlir.constant(64 : index) : i64
%94 = llvm.mlir.constant(-1 : index) : i64
%95 = llvm.mul %85, %94 : i64
%96 = llvm.mlir.constant(10 : index) : i64
%97 = llvm.add %95, %96 : i64
%98 = llvm.icmp "slt" %93, %97 : i64
%99 = llvm.select %98, %93, %97 : i1, i64
%100 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%101 = llvm.extractvalue %26[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%102 = llvm.bitcast %101 : !llvm.ptr<f32> to !llvm.ptr<f32>
%103 = llvm.insertvalue %102, %100[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%104 = llvm.extractvalue %26[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%105 = llvm.bitcast %104 : !llvm.ptr<f32> to !llvm.ptr<f32>
%106 = llvm.insertvalue %105, %103[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%107 = llvm.extractvalue %26[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%108 = llvm.extractvalue %26[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%109 = llvm.extractvalue %26[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%110 = llvm.mul %85, %107 : i64
%111 = llvm.add %109, %110 : i64
%112 = llvm.mlir.constant(0 : i64) : i64
%113 = llvm.mul %112, %108 : i64
%114 = llvm.add %111, %113 : i64
%115 = llvm.insertvalue %114, %106[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%116 = llvm.mlir.constant(10 : i64) : i64
%117 = llvm.mlir.constant(1 : i64) : i64
%118 = llvm.insertvalue %116, %115[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%119 = llvm.insertvalue %117, %118[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%120 = llvm.mlir.constant(10 : i64) : i64
%121 = llvm.insertvalue %99, %119[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%122 = llvm.insertvalue %120, %121[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%123 = llvm.mlir.constant(64 : index) : i64
%124 = llvm.mlir.constant(-1 : index) : i64
%125 = llvm.mul %91, %124 : i64
%126 = llvm.mlir.constant(10 : index) : i64
%127 = llvm.add %125, %126 : i64
%128 = llvm.icmp "slt" %123, %127 : i64
%129 = llvm.select %128, %123, %127 : i1, i64
%130 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%131 = llvm.extractvalue %46[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%132 = llvm.bitcast %131 : !llvm.ptr<f32> to !llvm.ptr<f32>
%133 = llvm.insertvalue %132, %130[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%134 = llvm.extractvalue %46[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%135 = llvm.bitcast %134 : !llvm.ptr<f32> to !llvm.ptr<f32>
%136 = llvm.insertvalue %135, %133[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%137 = llvm.extractvalue %46[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%138 = llvm.extractvalue %46[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%139 = llvm.extractvalue %46[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%140 = llvm.mlir.constant(0 : i64) : i64
%141 = llvm.mul %140, %137 : i64
%142 = llvm.add %139, %141 : i64
%143 = llvm.mul %91, %138 : i64
%144 = llvm.add %142, %143 : i64
%145 = llvm.insertvalue %144, %136[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%146 = llvm.mlir.constant(1 : i64) : i64
%147 = llvm.insertvalue %129, %145[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%148 = llvm.insertvalue %146, %147[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%149 = llvm.mlir.constant(10 : i64) : i64
%150 = llvm.mlir.constant(10 : i64) : i64
%151 = llvm.insertvalue %149, %148[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%152 = llvm.insertvalue %150, %151[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%153 = llvm.mlir.constant(-1 : index) : i64
%154 = llvm.mul %85, %153 : i64
%155 = llvm.mlir.constant(10 : index) : i64
%156 = llvm.add %154, %155 : i64
%157 = llvm.mlir.constant(64 : index) : i64
%158 = llvm.icmp "slt" %156, %157 : i64
%159 = llvm.select %158, %156, %157 : i1, i64
%160 = llvm.mlir.constant(-1 : index) : i64
%161 = llvm.mul %91, %160 : i64
%162 = llvm.mlir.constant(10 : index) : i64
%163 = llvm.add %161, %162 : i64
%164 = llvm.mlir.constant(64 : index) : i64
%165 = llvm.icmp "slt" %163, %164 : i64
%166 = llvm.select %165, %163, %164 : i1, i64
%167 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%168 = llvm.extractvalue %66[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%169 = llvm.bitcast %168 : !llvm.ptr<f32> to !llvm.ptr<f32>
%170 = llvm.insertvalue %169, %167[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%171 = llvm.extractvalue %66[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%172 = llvm.bitcast %171 : !llvm.ptr<f32> to !llvm.ptr<f32>
%173 = llvm.insertvalue %172, %170[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%174 = llvm.extractvalue %66[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%175 = llvm.extractvalue %66[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%176 = llvm.extractvalue %66[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%177 = llvm.mul %85, %174 : i64
%178 = llvm.add %176, %177 : i64
%179 = llvm.mul %91, %175 : i64
%180 = llvm.add %178, %179 : i64
%181 = llvm.insertvalue %180, %173[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%182 = llvm.mlir.constant(1 : i64) : i64
%183 = llvm.insertvalue %129, %181[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%184 = llvm.insertvalue %182, %183[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%185 = llvm.mlir.constant(10 : i64) : i64
%186 = llvm.insertvalue %99, %184[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%187 = llvm.insertvalue %185, %186[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb5(%2 : i64)
^bb5(%188: i64): // 2 preds: ^bb4, ^bb8
%189 = llvm.icmp "slt" %188, %159 : i64
llvm.cond_br %189, ^bb6(%2 : i64), ^bb9(%2 : i64)
^bb6(%190: i64): // 2 preds: ^bb5, ^bb7
%191 = llvm.icmp "slt" %190, %166 : i64
llvm.cond_br %191, ^bb7, ^bb8
^bb7: // pred: ^bb6
%192 = llvm.add %188, %85 : i64
%193 = llvm.add %190, %91 : i64
%194 = llvm.icmp "eq" %192, %193 : i64
%195 = llvm.select %194, %4, %5 : i1, f32
%196 = llvm.extractvalue %187[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%197 = llvm.extractvalue %187[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%198 = llvm.mlir.constant(10 : index) : i64
%199 = llvm.mul %188, %198 : i64
%200 = llvm.add %197, %199 : i64
%201 = llvm.add %200, %190 : i64
%202 = llvm.getelementptr %196[%201] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %195, %202 : !llvm.ptr<f32>
%203 = llvm.add %190, %6 : i64
llvm.br ^bb6(%203 : i64)
^bb8: // pred: ^bb6
%204 = llvm.add %188, %6 : i64
llvm.br ^bb5(%204 : i64)
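  // ^bb5-^bb8 above first write the (i == j ? 0.0 : 1.0) init pattern into
  // the current 64x64 output tile (the fill fused into this dispatch);
  // ^bb9 onward then runs the tiled accumulation over the same tile.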
^bb9(%205: i64): // 2 preds: ^bb5, ^bb26
%206 = llvm.icmp "slt" %205, %99 : i64
llvm.cond_br %206, ^bb10(%2 : i64), ^bb27
^bb10(%207: i64): // 2 preds: ^bb9, ^bb25
%208 = llvm.icmp "slt" %207, %129 : i64
llvm.cond_br %208, ^bb11, ^bb26
^bb11: // pred: ^bb10
%209 = llvm.mlir.constant(32 : index) : i64
%210 = llvm.mlir.constant(-1 : index) : i64
%211 = llvm.mul %205, %210 : i64
%212 = llvm.add %99, %211 : i64
%213 = llvm.icmp "slt" %209, %212 : i64
%214 = llvm.select %213, %209, %212 : i1, i64
%215 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%216 = llvm.extractvalue %122[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%217 = llvm.bitcast %216 : !llvm.ptr<f32> to !llvm.ptr<f32>
%218 = llvm.insertvalue %217, %215[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%219 = llvm.extractvalue %122[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%220 = llvm.bitcast %219 : !llvm.ptr<f32> to !llvm.ptr<f32>
%221 = llvm.insertvalue %220, %218[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%222 = llvm.extractvalue %122[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%223 = llvm.extractvalue %122[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%224 = llvm.extractvalue %122[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%225 = llvm.mul %205, %222 : i64
%226 = llvm.add %224, %225 : i64
%227 = llvm.mlir.constant(0 : i64) : i64
%228 = llvm.mul %227, %223 : i64
%229 = llvm.add %226, %228 : i64
%230 = llvm.insertvalue %229, %221[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%231 = llvm.mlir.constant(10 : i64) : i64
%232 = llvm.mlir.constant(1 : i64) : i64
%233 = llvm.insertvalue %231, %230[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%234 = llvm.insertvalue %232, %233[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%235 = llvm.mlir.constant(10 : i64) : i64
%236 = llvm.insertvalue %214, %234[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%237 = llvm.insertvalue %235, %236[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%238 = llvm.mlir.constant(32 : index) : i64
%239 = llvm.mlir.constant(-1 : index) : i64
%240 = llvm.mul %207, %239 : i64
%241 = llvm.add %129, %240 : i64
%242 = llvm.icmp "slt" %238, %241 : i64
%243 = llvm.select %242, %238, %241 : i1, i64
%244 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%245 = llvm.extractvalue %152[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%246 = llvm.bitcast %245 : !llvm.ptr<f32> to !llvm.ptr<f32>
%247 = llvm.insertvalue %246, %244[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%248 = llvm.extractvalue %152[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%249 = llvm.bitcast %248 : !llvm.ptr<f32> to !llvm.ptr<f32>
%250 = llvm.insertvalue %249, %247[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%251 = llvm.extractvalue %152[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%252 = llvm.extractvalue %152[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%253 = llvm.extractvalue %152[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%254 = llvm.mlir.constant(0 : i64) : i64
%255 = llvm.mul %254, %251 : i64
%256 = llvm.add %253, %255 : i64
%257 = llvm.mul %207, %252 : i64
%258 = llvm.add %256, %257 : i64
%259 = llvm.insertvalue %258, %250[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%260 = llvm.mlir.constant(1 : i64) : i64
%261 = llvm.insertvalue %243, %259[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%262 = llvm.insertvalue %260, %261[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%263 = llvm.mlir.constant(10 : i64) : i64
%264 = llvm.mlir.constant(10 : i64) : i64
%265 = llvm.insertvalue %263, %262[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%266 = llvm.insertvalue %264, %265[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%267 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%268 = llvm.extractvalue %187[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%269 = llvm.bitcast %268 : !llvm.ptr<f32> to !llvm.ptr<f32>
%270 = llvm.insertvalue %269, %267[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%271 = llvm.extractvalue %187[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%272 = llvm.bitcast %271 : !llvm.ptr<f32> to !llvm.ptr<f32>
%273 = llvm.insertvalue %272, %270[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%274 = llvm.extractvalue %187[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%275 = llvm.extractvalue %187[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%276 = llvm.extractvalue %187[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%277 = llvm.mul %205, %274 : i64
%278 = llvm.add %276, %277 : i64
%279 = llvm.mul %207, %275 : i64
%280 = llvm.add %278, %279 : i64
%281 = llvm.insertvalue %280, %273[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%282 = llvm.mlir.constant(1 : i64) : i64
%283 = llvm.insertvalue %243, %281[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%284 = llvm.insertvalue %282, %283[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%285 = llvm.mlir.constant(10 : i64) : i64
%286 = llvm.insertvalue %214, %284[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%287 = llvm.insertvalue %285, %286[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb12(%2 : i64)
^bb12(%288: i64): // 2 preds: ^bb11, ^bb24
%289 = llvm.icmp "slt" %288, %214 : i64
llvm.cond_br %289, ^bb13(%2 : i64), ^bb25
^bb13(%290: i64): // 2 preds: ^bb12, ^bb23
%291 = llvm.icmp "slt" %290, %243 : i64
llvm.cond_br %291, ^bb14(%2 : i64), ^bb24
^bb14(%292: i64): // 2 preds: ^bb13, ^bb22
%293 = llvm.icmp "slt" %292, %3 : i64
llvm.cond_br %293, ^bb15, ^bb23
^bb15: // pred: ^bb14
%294 = llvm.mlir.constant(4 : index) : i64
%295 = llvm.mlir.constant(-1 : index) : i64
%296 = llvm.mul %288, %295 : i64
%297 = llvm.add %214, %296 : i64
%298 = llvm.icmp "slt" %294, %297 : i64
%299 = llvm.select %298, %294, %297 : i1, i64
%300 = llvm.mlir.constant(4 : index) : i64
%301 = llvm.mlir.constant(-1 : index) : i64
%302 = llvm.mul %292, %301 : i64
%303 = llvm.mlir.constant(10 : index) : i64
%304 = llvm.add %302, %303 : i64
%305 = llvm.icmp "slt" %300, %304 : i64
%306 = llvm.select %305, %300, %304 : i1, i64
%307 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%308 = llvm.extractvalue %237[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%309 = llvm.bitcast %308 : !llvm.ptr<f32> to !llvm.ptr<f32>
%310 = llvm.insertvalue %309, %307[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%311 = llvm.extractvalue %237[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%312 = llvm.bitcast %311 : !llvm.ptr<f32> to !llvm.ptr<f32>
%313 = llvm.insertvalue %312, %310[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%314 = llvm.extractvalue %237[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%315 = llvm.extractvalue %237[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%316 = llvm.extractvalue %237[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%317 = llvm.mul %288, %314 : i64
%318 = llvm.add %316, %317 : i64
%319 = llvm.mul %292, %315 : i64
%320 = llvm.add %318, %319 : i64
%321 = llvm.insertvalue %320, %313[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%322 = llvm.mlir.constant(1 : i64) : i64
%323 = llvm.insertvalue %306, %321[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%324 = llvm.insertvalue %322, %323[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%325 = llvm.mlir.constant(10 : i64) : i64
%326 = llvm.insertvalue %299, %324[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%327 = llvm.insertvalue %325, %326[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%328 = llvm.mlir.constant(4 : index) : i64
%329 = llvm.mlir.constant(-1 : index) : i64
%330 = llvm.mul %290, %329 : i64
%331 = llvm.add %243, %330 : i64
%332 = llvm.icmp "slt" %328, %331 : i64
%333 = llvm.select %332, %328, %331 : i1, i64
%334 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%335 = llvm.extractvalue %266[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%336 = llvm.bitcast %335 : !llvm.ptr<f32> to !llvm.ptr<f32>
%337 = llvm.insertvalue %336, %334[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%338 = llvm.extractvalue %266[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%339 = llvm.bitcast %338 : !llvm.ptr<f32> to !llvm.ptr<f32>
%340 = llvm.insertvalue %339, %337[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%341 = llvm.extractvalue %266[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%342 = llvm.extractvalue %266[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%343 = llvm.extractvalue %266[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%344 = llvm.mul %292, %341 : i64
%345 = llvm.add %343, %344 : i64
%346 = llvm.mul %290, %342 : i64
%347 = llvm.add %345, %346 : i64
%348 = llvm.insertvalue %347, %340[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%349 = llvm.mlir.constant(1 : i64) : i64
%350 = llvm.insertvalue %333, %348[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%351 = llvm.insertvalue %349, %350[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%352 = llvm.mlir.constant(10 : i64) : i64
%353 = llvm.insertvalue %306, %351[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%354 = llvm.insertvalue %352, %353[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%355 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%356 = llvm.extractvalue %287[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%357 = llvm.bitcast %356 : !llvm.ptr<f32> to !llvm.ptr<f32>
%358 = llvm.insertvalue %357, %355[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%359 = llvm.extractvalue %287[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%360 = llvm.bitcast %359 : !llvm.ptr<f32> to !llvm.ptr<f32>
%361 = llvm.insertvalue %360, %358[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%362 = llvm.extractvalue %287[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%363 = llvm.extractvalue %287[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%364 = llvm.extractvalue %287[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%365 = llvm.mul %288, %362 : i64
%366 = llvm.add %364, %365 : i64
%367 = llvm.mul %290, %363 : i64
%368 = llvm.add %366, %367 : i64
%369 = llvm.insertvalue %368, %361[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%370 = llvm.mlir.constant(1 : i64) : i64
%371 = llvm.insertvalue %333, %369[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%372 = llvm.insertvalue %370, %371[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%373 = llvm.mlir.constant(10 : i64) : i64
%374 = llvm.insertvalue %299, %372[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%375 = llvm.insertvalue %373, %374[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb16(%2 : i64)
^bb16(%376: i64): // 2 preds: ^bb15, ^bb21
%377 = llvm.icmp "slt" %376, %299 : i64
llvm.cond_br %377, ^bb17(%2 : i64), ^bb22
^bb17(%378: i64): // 2 preds: ^bb16, ^bb20
%379 = llvm.icmp "slt" %378, %333 : i64
llvm.cond_br %379, ^bb18(%2 : i64), ^bb21
^bb18(%380: i64): // 2 preds: ^bb17, ^bb19
%381 = llvm.icmp "slt" %380, %306 : i64
llvm.cond_br %381, ^bb19, ^bb20
^bb19: // pred: ^bb18
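  // Scalar micro-kernel: with row-major strides of 10, the three loads and
  // the store below implement, for each (i, j, k) of the 4x4x4 micro-tile,
  //
  //   c[i * 10 + j] += a[i * 10 + k] * b[k * 10 + j];
  //
  // where a, b and c point at the current subviews of bindings 0, 1 and 2.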
%382 = llvm.extractvalue %327[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%383 = llvm.extractvalue %327[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%384 = llvm.mlir.constant(10 : index) : i64
%385 = llvm.mul %376, %384 : i64
%386 = llvm.add %383, %385 : i64
%387 = llvm.add %386, %380 : i64
%388 = llvm.getelementptr %382[%387] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%389 = llvm.load %388 : !llvm.ptr<f32>
%390 = llvm.extractvalue %354[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%391 = llvm.extractvalue %354[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%392 = llvm.mlir.constant(10 : index) : i64
%393 = llvm.mul %380, %392 : i64
%394 = llvm.add %391, %393 : i64
%395 = llvm.add %394, %378 : i64
%396 = llvm.getelementptr %390[%395] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%397 = llvm.load %396 : !llvm.ptr<f32>
%398 = llvm.extractvalue %375[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%399 = llvm.extractvalue %375[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%400 = llvm.mlir.constant(10 : index) : i64
%401 = llvm.mul %376, %400 : i64
%402 = llvm.add %399, %401 : i64
%403 = llvm.add %402, %378 : i64
%404 = llvm.getelementptr %398[%403] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%405 = llvm.load %404 : !llvm.ptr<f32>
%406 = llvm.fmul %389, %397 : f32
%407 = llvm.fadd %405, %406 : f32
%408 = llvm.extractvalue %375[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%409 = llvm.extractvalue %375[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%410 = llvm.mlir.constant(10 : index) : i64
%411 = llvm.mul %376, %410 : i64
%412 = llvm.add %409, %411 : i64
%413 = llvm.add %412, %378 : i64
%414 = llvm.getelementptr %408[%413] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %407, %414 : !llvm.ptr<f32>
%415 = llvm.add %380, %6 : i64
llvm.br ^bb18(%415 : i64)
^bb20: // pred: ^bb18
%416 = llvm.add %378, %6 : i64
llvm.br ^bb17(%416 : i64)
^bb21: // pred: ^bb17
%417 = llvm.add %376, %6 : i64
llvm.br ^bb16(%417 : i64)
^bb22: // pred: ^bb16
%418 = llvm.add %292, %0 : i64
llvm.br ^bb14(%418 : i64)
^bb23: // pred: ^bb14
%419 = llvm.add %290, %0 : i64
llvm.br ^bb13(%419 : i64)
^bb24: // pred: ^bb13
%420 = llvm.add %288, %0 : i64
llvm.br ^bb12(%420 : i64)
^bb25: // pred: ^bb12
%421 = llvm.add %207, %1 : i64
llvm.br ^bb10(%421 : i64)
^bb26: // pred: ^bb10
%422 = llvm.add %205, %1 : i64
llvm.br ^bb9(%422 : i64)
^bb27: // pred: ^bb9
%423 = llvm.add %91, %90 : i64
llvm.br ^bb3(%423 : i64)
^bb28: // pred: ^bb3
%424 = llvm.add %85, %84 : i64
llvm.br ^bb1(%424 : i64)
^bb29: // pred: ^bb1
%425 = llvm.mlir.constant(0 : i32) : i32
llvm.return %425 : i32
}
}
// -----// IR Dump After Canonicalizer //----- //
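// Canonicalization folds the memref descriptor insertvalue/extractvalue
// chains of dispatch_0 away: below, the subview arithmetic is performed
// directly on the binding base pointer (%11) and inline constants instead
// of repacking a descriptor struct per tile. The loop structure is
// unchanged.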
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.mlir.constant(10 : index) : i64
%2 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%3 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%4 = llvm.mlir.constant(1 : index) : i64
%5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.mlir.constant(0 : i64) : i64
%8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%9 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32>
%12 = llvm.mlir.constant(0 : index) : i64
%13 = llvm.mlir.constant(10 : index) : i64
%14 = llvm.mlir.constant(1 : index) : i64
%15 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%16 = llvm.extractvalue %15[0] : !llvm.array<3 x i32>
%17 = llvm.zext %16 : i32 to i64
%18 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%19 = llvm.extractvalue %18[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%20 = llvm.extractvalue %19[0] : !llvm.array<3 x i32>
%21 = llvm.zext %20 : i32 to i64
%22 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%23 = llvm.extractvalue %22[1] : !llvm.array<3 x i32>
%24 = llvm.zext %23 : i32 to i64
%25 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%26 = llvm.extractvalue %25[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%27 = llvm.extractvalue %26[1] : !llvm.array<3 x i32>
%28 = llvm.zext %27 : i32 to i64
%29 = llvm.mlir.constant(64 : index) : i64
%30 = llvm.mul %24, %29 : i64
%31 = llvm.mlir.constant(64 : index) : i64
%32 = llvm.mul %28, %31 : i64
llvm.br ^bb1(%30 : i64)
^bb1(%33: i64): // 2 preds: ^bb0, ^bb10
%34 = llvm.icmp "slt" %33, %1 : i64
llvm.cond_br %34, ^bb2, ^bb11
^bb2: // pred: ^bb1
%35 = llvm.mlir.constant(64 : index) : i64
%36 = llvm.mul %17, %35 : i64
%37 = llvm.mlir.constant(64 : index) : i64
%38 = llvm.mul %21, %37 : i64
llvm.br ^bb3(%36 : i64)
^bb3(%39: i64): // 2 preds: ^bb2, ^bb9
%40 = llvm.icmp "slt" %39, %1 : i64
llvm.cond_br %40, ^bb4, ^bb10
^bb4: // pred: ^bb3
%41 = llvm.mlir.constant(64 : index) : i64
%42 = llvm.mlir.constant(-1 : index) : i64
%43 = llvm.mul %33, %42 : i64
%44 = llvm.mlir.constant(10 : index) : i64
%45 = llvm.add %43, %44 : i64
%46 = llvm.icmp "slt" %41, %45 : i64
%47 = llvm.select %46, %41, %45 : i1, i64
%48 = llvm.mlir.constant(64 : index) : i64
%49 = llvm.mlir.constant(-1 : index) : i64
%50 = llvm.mul %39, %49 : i64
%51 = llvm.mlir.constant(10 : index) : i64
%52 = llvm.add %50, %51 : i64
%53 = llvm.icmp "slt" %48, %52 : i64
%54 = llvm.select %53, %48, %52 : i1, i64
%55 = llvm.bitcast %11 : !llvm.ptr<f32> to !llvm.ptr<f32>
%56 = llvm.mul %33, %13 : i64
%57 = llvm.add %12, %56 : i64
%58 = llvm.mul %39, %14 : i64
%59 = llvm.add %57, %58 : i64
llvm.br ^bb5(%0 : i64)
^bb5(%60: i64): // 2 preds: ^bb4, ^bb8
%61 = llvm.icmp "slt" %60, %47 : i64
llvm.cond_br %61, ^bb6(%0 : i64), ^bb9
^bb6(%62: i64): // 2 preds: ^bb5, ^bb7
%63 = llvm.icmp "slt" %62, %54 : i64
llvm.cond_br %63, ^bb7, ^bb8
^bb7: // pred: ^bb6
%64 = llvm.add %60, %33 : i64
%65 = llvm.add %62, %39 : i64
%66 = llvm.icmp "eq" %64, %65 : i64
%67 = llvm.select %66, %2, %3 : i1, f32
%68 = llvm.mlir.constant(10 : index) : i64
%69 = llvm.mul %60, %68 : i64
%70 = llvm.add %59, %69 : i64
%71 = llvm.add %70, %62 : i64
%72 = llvm.getelementptr %55[%71] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %67, %72 : !llvm.ptr<f32>
%73 = llvm.add %62, %4 : i64
llvm.br ^bb6(%73 : i64)
^bb8: // pred: ^bb6
%74 = llvm.add %60, %4 : i64
llvm.br ^bb5(%74 : i64)
^bb9: // pred: ^bb5
%75 = llvm.add %39, %38 : i64
llvm.br ^bb3(%75 : i64)
^bb10: // pred: ^bb3
%76 = llvm.add %33, %32 : i64
llvm.br ^bb1(%76 : i64)
^bb11: // pred: ^bb1
%77 = llvm.mlir.constant(0 : i32) : i32
llvm.return %77 : i32
}
}
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility //----- //
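// The same visibility synchronization as for dispatch_0 above, now applied
// to dispatch_2: the sym_visibility = "private" attribute is added, and the
// body appears otherwise unchanged from the ConvertToLLVM dump.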
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(4 : index) : i64
%1 = llvm.mlir.constant(32 : index) : i64
%2 = llvm.mlir.constant(0 : index) : i64
%3 = llvm.mlir.constant(10 : index) : i64
%4 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%5 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%11 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32>
%14 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%15 = llvm.insertvalue %13, %14[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%16 = llvm.insertvalue %13, %15[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%17 = llvm.mlir.constant(0 : index) : i64
%18 = llvm.insertvalue %17, %16[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%19 = llvm.mlir.constant(10 : index) : i64
%20 = llvm.insertvalue %19, %18[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%21 = llvm.mlir.constant(10 : index) : i64
%22 = llvm.insertvalue %21, %20[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%23 = llvm.mlir.constant(10 : index) : i64
%24 = llvm.insertvalue %23, %22[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%25 = llvm.mlir.constant(1 : index) : i64
%26 = llvm.insertvalue %25, %24[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%27 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%28 = llvm.extractvalue %27[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%29 = llvm.mlir.constant(1 : i64) : i64
%30 = llvm.getelementptr %28[%29] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.getelementptr %31[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%33 = llvm.bitcast %32 : !llvm.ptr<i8> to !llvm.ptr<f32>
%34 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%35 = llvm.insertvalue %33, %34[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%36 = llvm.insertvalue %33, %35[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%37 = llvm.mlir.constant(0 : index) : i64
%38 = llvm.insertvalue %37, %36[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%39 = llvm.mlir.constant(10 : index) : i64
%40 = llvm.insertvalue %39, %38[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%41 = llvm.mlir.constant(10 : index) : i64
%42 = llvm.insertvalue %41, %40[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%43 = llvm.mlir.constant(10 : index) : i64
%44 = llvm.insertvalue %43, %42[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%45 = llvm.mlir.constant(1 : index) : i64
%46 = llvm.insertvalue %45, %44[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%47 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%48 = llvm.extractvalue %47[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%49 = llvm.mlir.constant(2 : i64) : i64
%50 = llvm.getelementptr %48[%49] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%51 = llvm.load %50 : !llvm.ptr<ptr<i8>>
%52 = llvm.getelementptr %51[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%53 = llvm.bitcast %52 : !llvm.ptr<i8> to !llvm.ptr<f32>
%54 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%55 = llvm.insertvalue %53, %54[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%56 = llvm.insertvalue %53, %55[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%57 = llvm.mlir.constant(0 : index) : i64
%58 = llvm.insertvalue %57, %56[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%59 = llvm.mlir.constant(10 : index) : i64
%60 = llvm.insertvalue %59, %58[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%61 = llvm.mlir.constant(10 : index) : i64
%62 = llvm.insertvalue %61, %60[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%63 = llvm.mlir.constant(10 : index) : i64
%64 = llvm.insertvalue %63, %62[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%65 = llvm.mlir.constant(1 : index) : i64
%66 = llvm.insertvalue %65, %64[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%67 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%68 = llvm.extractvalue %67[0] : !llvm.array<3 x i32>
%69 = llvm.zext %68 : i32 to i64
%70 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%71 = llvm.extractvalue %70[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%72 = llvm.extractvalue %71[0] : !llvm.array<3 x i32>
%73 = llvm.zext %72 : i32 to i64
%74 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%75 = llvm.extractvalue %74[1] : !llvm.array<3 x i32>
%76 = llvm.zext %75 : i32 to i64
%77 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%78 = llvm.extractvalue %77[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%79 = llvm.extractvalue %78[1] : !llvm.array<3 x i32>
%80 = llvm.zext %79 : i32 to i64
%81 = llvm.mlir.constant(64 : index) : i64
%82 = llvm.mul %76, %81 : i64
%83 = llvm.mlir.constant(64 : index) : i64
%84 = llvm.mul %80, %83 : i64
llvm.br ^bb1(%82 : i64)
^bb1(%85: i64): // 2 preds: ^bb0, ^bb28
%86 = llvm.icmp "slt" %85, %3 : i64
llvm.cond_br %86, ^bb2, ^bb29
^bb2: // pred: ^bb1
%87 = llvm.mlir.constant(64 : index) : i64
%88 = llvm.mul %69, %87 : i64
%89 = llvm.mlir.constant(64 : index) : i64
%90 = llvm.mul %73, %89 : i64
llvm.br ^bb3(%88 : i64)
^bb3(%91: i64): // 2 preds: ^bb2, ^bb27
%92 = llvm.icmp "slt" %91, %3 : i64
llvm.cond_br %92, ^bb4, ^bb28
^bb4: // pred: ^bb3
%93 = llvm.mlir.constant(64 : index) : i64
%94 = llvm.mlir.constant(-1 : index) : i64
%95 = llvm.mul %85, %94 : i64
%96 = llvm.mlir.constant(10 : index) : i64
%97 = llvm.add %95, %96 : i64
%98 = llvm.icmp "slt" %93, %97 : i64
%99 = llvm.select %98, %93, %97 : i1, i64
%100 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%101 = llvm.extractvalue %26[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%102 = llvm.bitcast %101 : !llvm.ptr<f32> to !llvm.ptr<f32>
%103 = llvm.insertvalue %102, %100[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%104 = llvm.extractvalue %26[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%105 = llvm.bitcast %104 : !llvm.ptr<f32> to !llvm.ptr<f32>
%106 = llvm.insertvalue %105, %103[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%107 = llvm.extractvalue %26[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%108 = llvm.extractvalue %26[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%109 = llvm.extractvalue %26[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%110 = llvm.mul %85, %107 : i64
%111 = llvm.add %109, %110 : i64
%112 = llvm.mlir.constant(0 : i64) : i64
%113 = llvm.mul %112, %108 : i64
%114 = llvm.add %111, %113 : i64
%115 = llvm.insertvalue %114, %106[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%116 = llvm.mlir.constant(10 : i64) : i64
%117 = llvm.mlir.constant(1 : i64) : i64
%118 = llvm.insertvalue %116, %115[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%119 = llvm.insertvalue %117, %118[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%120 = llvm.mlir.constant(10 : i64) : i64
%121 = llvm.insertvalue %99, %119[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%122 = llvm.insertvalue %120, %121[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%123 = llvm.mlir.constant(64 : index) : i64
%124 = llvm.mlir.constant(-1 : index) : i64
%125 = llvm.mul %91, %124 : i64
%126 = llvm.mlir.constant(10 : index) : i64
%127 = llvm.add %125, %126 : i64
%128 = llvm.icmp "slt" %123, %127 : i64
%129 = llvm.select %128, %123, %127 : i1, i64
%130 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%131 = llvm.extractvalue %46[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%132 = llvm.bitcast %131 : !llvm.ptr<f32> to !llvm.ptr<f32>
%133 = llvm.insertvalue %132, %130[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%134 = llvm.extractvalue %46[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%135 = llvm.bitcast %134 : !llvm.ptr<f32> to !llvm.ptr<f32>
%136 = llvm.insertvalue %135, %133[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%137 = llvm.extractvalue %46[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%138 = llvm.extractvalue %46[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%139 = llvm.extractvalue %46[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%140 = llvm.mlir.constant(0 : i64) : i64
%141 = llvm.mul %140, %137 : i64
%142 = llvm.add %139, %141 : i64
%143 = llvm.mul %91, %138 : i64
%144 = llvm.add %142, %143 : i64
%145 = llvm.insertvalue %144, %136[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%146 = llvm.mlir.constant(1 : i64) : i64
%147 = llvm.insertvalue %129, %145[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%148 = llvm.insertvalue %146, %147[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%149 = llvm.mlir.constant(10 : i64) : i64
%150 = llvm.mlir.constant(10 : i64) : i64
%151 = llvm.insertvalue %149, %148[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%152 = llvm.insertvalue %150, %151[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%153 = llvm.mlir.constant(-1 : index) : i64
%154 = llvm.mul %85, %153 : i64
%155 = llvm.mlir.constant(10 : index) : i64
%156 = llvm.add %154, %155 : i64
%157 = llvm.mlir.constant(64 : index) : i64
%158 = llvm.icmp "slt" %156, %157 : i64
%159 = llvm.select %158, %156, %157 : i1, i64
%160 = llvm.mlir.constant(-1 : index) : i64
%161 = llvm.mul %91, %160 : i64
%162 = llvm.mlir.constant(10 : index) : i64
%163 = llvm.add %161, %162 : i64
%164 = llvm.mlir.constant(64 : index) : i64
%165 = llvm.icmp "slt" %163, %164 : i64
%166 = llvm.select %165, %163, %164 : i1, i64
%167 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%168 = llvm.extractvalue %66[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%169 = llvm.bitcast %168 : !llvm.ptr<f32> to !llvm.ptr<f32>
%170 = llvm.insertvalue %169, %167[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%171 = llvm.extractvalue %66[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%172 = llvm.bitcast %171 : !llvm.ptr<f32> to !llvm.ptr<f32>
%173 = llvm.insertvalue %172, %170[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%174 = llvm.extractvalue %66[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%175 = llvm.extractvalue %66[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%176 = llvm.extractvalue %66[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%177 = llvm.mul %85, %174 : i64
%178 = llvm.add %176, %177 : i64
%179 = llvm.mul %91, %175 : i64
%180 = llvm.add %178, %179 : i64
%181 = llvm.insertvalue %180, %173[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%182 = llvm.mlir.constant(1 : i64) : i64
%183 = llvm.insertvalue %129, %181[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%184 = llvm.insertvalue %182, %183[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%185 = llvm.mlir.constant(10 : i64) : i64
%186 = llvm.insertvalue %99, %184[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%187 = llvm.insertvalue %185, %186[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb5(%2 : i64)
^bb5(%188: i64): // 2 preds: ^bb4, ^bb8
%189 = llvm.icmp "slt" %188, %159 : i64
llvm.cond_br %189, ^bb6(%2 : i64), ^bb9(%2 : i64)
^bb6(%190: i64): // 2 preds: ^bb5, ^bb7
%191 = llvm.icmp "slt" %190, %166 : i64
llvm.cond_br %191, ^bb7, ^bb8
^bb7: // pred: ^bb6
%192 = llvm.add %188, %85 : i64
%193 = llvm.add %190, %91 : i64
%194 = llvm.icmp "eq" %192, %193 : i64
%195 = llvm.select %194, %4, %5 : i1, f32
%196 = llvm.extractvalue %187[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%197 = llvm.extractvalue %187[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%198 = llvm.mlir.constant(10 : index) : i64
%199 = llvm.mul %188, %198 : i64
%200 = llvm.add %197, %199 : i64
%201 = llvm.add %200, %190 : i64
%202 = llvm.getelementptr %196[%201] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %195, %202 : !llvm.ptr<f32>
%203 = llvm.add %190, %6 : i64
llvm.br ^bb6(%203 : i64)
^bb8: // pred: ^bb6
%204 = llvm.add %188, %6 : i64
llvm.br ^bb5(%204 : i64)
^bb9(%205: i64): // 2 preds: ^bb5, ^bb26
%206 = llvm.icmp "slt" %205, %99 : i64
llvm.cond_br %206, ^bb10(%2 : i64), ^bb27
^bb10(%207: i64): // 2 preds: ^bb9, ^bb25
%208 = llvm.icmp "slt" %207, %129 : i64
llvm.cond_br %208, ^bb11, ^bb26
^bb11: // pred: ^bb10
%209 = llvm.mlir.constant(32 : index) : i64
%210 = llvm.mlir.constant(-1 : index) : i64
%211 = llvm.mul %205, %210 : i64
%212 = llvm.add %99, %211 : i64
%213 = llvm.icmp "slt" %209, %212 : i64
%214 = llvm.select %213, %209, %212 : i1, i64
%215 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%216 = llvm.extractvalue %122[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%217 = llvm.bitcast %216 : !llvm.ptr<f32> to !llvm.ptr<f32>
%218 = llvm.insertvalue %217, %215[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%219 = llvm.extractvalue %122[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%220 = llvm.bitcast %219 : !llvm.ptr<f32> to !llvm.ptr<f32>
%221 = llvm.insertvalue %220, %218[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%222 = llvm.extractvalue %122[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%223 = llvm.extractvalue %122[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%224 = llvm.extractvalue %122[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%225 = llvm.mul %205, %222 : i64
%226 = llvm.add %224, %225 : i64
%227 = llvm.mlir.constant(0 : i64) : i64
%228 = llvm.mul %227, %223 : i64
%229 = llvm.add %226, %228 : i64
%230 = llvm.insertvalue %229, %221[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%231 = llvm.mlir.constant(10 : i64) : i64
%232 = llvm.mlir.constant(1 : i64) : i64
%233 = llvm.insertvalue %231, %230[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%234 = llvm.insertvalue %232, %233[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%235 = llvm.mlir.constant(10 : i64) : i64
%236 = llvm.insertvalue %214, %234[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%237 = llvm.insertvalue %235, %236[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%238 = llvm.mlir.constant(32 : index) : i64
%239 = llvm.mlir.constant(-1 : index) : i64
%240 = llvm.mul %207, %239 : i64
%241 = llvm.add %129, %240 : i64
%242 = llvm.icmp "slt" %238, %241 : i64
%243 = llvm.select %242, %238, %241 : i1, i64
%244 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%245 = llvm.extractvalue %152[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%246 = llvm.bitcast %245 : !llvm.ptr<f32> to !llvm.ptr<f32>
%247 = llvm.insertvalue %246, %244[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%248 = llvm.extractvalue %152[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%249 = llvm.bitcast %248 : !llvm.ptr<f32> to !llvm.ptr<f32>
%250 = llvm.insertvalue %249, %247[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%251 = llvm.extractvalue %152[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%252 = llvm.extractvalue %152[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%253 = llvm.extractvalue %152[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%254 = llvm.mlir.constant(0 : i64) : i64
%255 = llvm.mul %254, %251 : i64
%256 = llvm.add %253, %255 : i64
%257 = llvm.mul %207, %252 : i64
%258 = llvm.add %256, %257 : i64
%259 = llvm.insertvalue %258, %250[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%260 = llvm.mlir.constant(1 : i64) : i64
%261 = llvm.insertvalue %243, %259[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%262 = llvm.insertvalue %260, %261[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%263 = llvm.mlir.constant(10 : i64) : i64
%264 = llvm.mlir.constant(10 : i64) : i64
%265 = llvm.insertvalue %263, %262[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%266 = llvm.insertvalue %264, %265[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%267 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%268 = llvm.extractvalue %187[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%269 = llvm.bitcast %268 : !llvm.ptr<f32> to !llvm.ptr<f32>
%270 = llvm.insertvalue %269, %267[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%271 = llvm.extractvalue %187[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%272 = llvm.bitcast %271 : !llvm.ptr<f32> to !llvm.ptr<f32>
%273 = llvm.insertvalue %272, %270[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%274 = llvm.extractvalue %187[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%275 = llvm.extractvalue %187[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%276 = llvm.extractvalue %187[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%277 = llvm.mul %205, %274 : i64
%278 = llvm.add %276, %277 : i64
%279 = llvm.mul %207, %275 : i64
%280 = llvm.add %278, %279 : i64
%281 = llvm.insertvalue %280, %273[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%282 = llvm.mlir.constant(1 : i64) : i64
%283 = llvm.insertvalue %243, %281[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%284 = llvm.insertvalue %282, %283[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%285 = llvm.mlir.constant(10 : i64) : i64
%286 = llvm.insertvalue %214, %284[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%287 = llvm.insertvalue %285, %286[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb12(%2 : i64)
^bb12(%288: i64): // 2 preds: ^bb11, ^bb24
%289 = llvm.icmp "slt" %288, %214 : i64
llvm.cond_br %289, ^bb13(%2 : i64), ^bb25
^bb13(%290: i64): // 2 preds: ^bb12, ^bb23
%291 = llvm.icmp "slt" %290, %243 : i64
llvm.cond_br %291, ^bb14(%2 : i64), ^bb24
^bb14(%292: i64): // 2 preds: ^bb13, ^bb22
%293 = llvm.icmp "slt" %292, %3 : i64
llvm.cond_br %293, ^bb15, ^bb23
^bb15: // pred: ^bb14
%294 = llvm.mlir.constant(4 : index) : i64
%295 = llvm.mlir.constant(-1 : index) : i64
%296 = llvm.mul %288, %295 : i64
%297 = llvm.add %214, %296 : i64
%298 = llvm.icmp "slt" %294, %297 : i64
%299 = llvm.select %298, %294, %297 : i1, i64
%300 = llvm.mlir.constant(4 : index) : i64
%301 = llvm.mlir.constant(-1 : index) : i64
%302 = llvm.mul %292, %301 : i64
%303 = llvm.mlir.constant(10 : index) : i64
%304 = llvm.add %302, %303 : i64
%305 = llvm.icmp "slt" %300, %304 : i64
%306 = llvm.select %305, %300, %304 : i1, i64
%307 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%308 = llvm.extractvalue %237[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%309 = llvm.bitcast %308 : !llvm.ptr<f32> to !llvm.ptr<f32>
%310 = llvm.insertvalue %309, %307[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%311 = llvm.extractvalue %237[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%312 = llvm.bitcast %311 : !llvm.ptr<f32> to !llvm.ptr<f32>
%313 = llvm.insertvalue %312, %310[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%314 = llvm.extractvalue %237[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%315 = llvm.extractvalue %237[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%316 = llvm.extractvalue %237[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%317 = llvm.mul %288, %314 : i64
%318 = llvm.add %316, %317 : i64
%319 = llvm.mul %292, %315 : i64
%320 = llvm.add %318, %319 : i64
%321 = llvm.insertvalue %320, %313[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%322 = llvm.mlir.constant(1 : i64) : i64
%323 = llvm.insertvalue %306, %321[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%324 = llvm.insertvalue %322, %323[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%325 = llvm.mlir.constant(10 : i64) : i64
%326 = llvm.insertvalue %299, %324[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%327 = llvm.insertvalue %325, %326[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%328 = llvm.mlir.constant(4 : index) : i64
%329 = llvm.mlir.constant(-1 : index) : i64
%330 = llvm.mul %290, %329 : i64
%331 = llvm.add %243, %330 : i64
%332 = llvm.icmp "slt" %328, %331 : i64
%333 = llvm.select %332, %328, %331 : i1, i64
%334 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%335 = llvm.extractvalue %266[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%336 = llvm.bitcast %335 : !llvm.ptr<f32> to !llvm.ptr<f32>
%337 = llvm.insertvalue %336, %334[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%338 = llvm.extractvalue %266[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%339 = llvm.bitcast %338 : !llvm.ptr<f32> to !llvm.ptr<f32>
%340 = llvm.insertvalue %339, %337[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%341 = llvm.extractvalue %266[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%342 = llvm.extractvalue %266[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%343 = llvm.extractvalue %266[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%344 = llvm.mul %292, %341 : i64
%345 = llvm.add %343, %344 : i64
%346 = llvm.mul %290, %342 : i64
%347 = llvm.add %345, %346 : i64
%348 = llvm.insertvalue %347, %340[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%349 = llvm.mlir.constant(1 : i64) : i64
%350 = llvm.insertvalue %333, %348[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%351 = llvm.insertvalue %349, %350[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%352 = llvm.mlir.constant(10 : i64) : i64
%353 = llvm.insertvalue %306, %351[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%354 = llvm.insertvalue %352, %353[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%355 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%356 = llvm.extractvalue %287[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%357 = llvm.bitcast %356 : !llvm.ptr<f32> to !llvm.ptr<f32>
%358 = llvm.insertvalue %357, %355[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%359 = llvm.extractvalue %287[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%360 = llvm.bitcast %359 : !llvm.ptr<f32> to !llvm.ptr<f32>
%361 = llvm.insertvalue %360, %358[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%362 = llvm.extractvalue %287[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%363 = llvm.extractvalue %287[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%364 = llvm.extractvalue %287[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%365 = llvm.mul %288, %362 : i64
%366 = llvm.add %364, %365 : i64
%367 = llvm.mul %290, %363 : i64
%368 = llvm.add %366, %367 : i64
%369 = llvm.insertvalue %368, %361[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%370 = llvm.mlir.constant(1 : i64) : i64
%371 = llvm.insertvalue %333, %369[3, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%372 = llvm.insertvalue %370, %371[4, 1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%373 = llvm.mlir.constant(10 : i64) : i64
%374 = llvm.insertvalue %299, %372[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%375 = llvm.insertvalue %373, %374[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
llvm.br ^bb16(%2 : i64)
^bb16(%376: i64): // 2 preds: ^bb15, ^bb21
%377 = llvm.icmp "slt" %376, %299 : i64
llvm.cond_br %377, ^bb17(%2 : i64), ^bb22
^bb17(%378: i64): // 2 preds: ^bb16, ^bb20
%379 = llvm.icmp "slt" %378, %333 : i64
llvm.cond_br %379, ^bb18(%2 : i64), ^bb21
^bb18(%380: i64): // 2 preds: ^bb17, ^bb19
%381 = llvm.icmp "slt" %380, %306 : i64
llvm.cond_br %381, ^bb19, ^bb20
^bb19: // pred: ^bb18
%382 = llvm.extractvalue %327[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%383 = llvm.extractvalue %327[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%384 = llvm.mlir.constant(10 : index) : i64
%385 = llvm.mul %376, %384 : i64
%386 = llvm.add %383, %385 : i64
%387 = llvm.add %386, %380 : i64
%388 = llvm.getelementptr %382[%387] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%389 = llvm.load %388 : !llvm.ptr<f32>
%390 = llvm.extractvalue %354[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%391 = llvm.extractvalue %354[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%392 = llvm.mlir.constant(10 : index) : i64
%393 = llvm.mul %380, %392 : i64
%394 = llvm.add %391, %393 : i64
%395 = llvm.add %394, %378 : i64
%396 = llvm.getelementptr %390[%395] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%397 = llvm.load %396 : !llvm.ptr<f32>
%398 = llvm.extractvalue %375[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%399 = llvm.extractvalue %375[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%400 = llvm.mlir.constant(10 : index) : i64
%401 = llvm.mul %376, %400 : i64
%402 = llvm.add %399, %401 : i64
%403 = llvm.add %402, %378 : i64
%404 = llvm.getelementptr %398[%403] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%405 = llvm.load %404 : !llvm.ptr<f32>
%406 = llvm.fmul %389, %397 : f32
%407 = llvm.fadd %405, %406 : f32
%408 = llvm.extractvalue %375[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%409 = llvm.extractvalue %375[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
%410 = llvm.mlir.constant(10 : index) : i64
%411 = llvm.mul %376, %410 : i64
%412 = llvm.add %409, %411 : i64
%413 = llvm.add %412, %378 : i64
%414 = llvm.getelementptr %408[%413] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %407, %414 : !llvm.ptr<f32>
%415 = llvm.add %380, %6 : i64
llvm.br ^bb18(%415 : i64)
^bb20: // pred: ^bb18
%416 = llvm.add %378, %6 : i64
llvm.br ^bb17(%416 : i64)
^bb21: // pred: ^bb17
%417 = llvm.add %376, %6 : i64
llvm.br ^bb16(%417 : i64)
^bb22: // pred: ^bb16
%418 = llvm.add %292, %0 : i64
llvm.br ^bb14(%418 : i64)
^bb23: // pred: ^bb14
%419 = llvm.add %290, %0 : i64
llvm.br ^bb13(%419 : i64)
^bb24: // pred: ^bb13
%420 = llvm.add %288, %0 : i64
llvm.br ^bb12(%420 : i64)
^bb25: // pred: ^bb12
%421 = llvm.add %207, %1 : i64
llvm.br ^bb10(%421 : i64)
^bb26: // pred: ^bb10
%422 = llvm.add %205, %1 : i64
llvm.br ^bb9(%422 : i64)
^bb27: // pred: ^bb9
%423 = llvm.add %91, %90 : i64
llvm.br ^bb3(%423 : i64)
^bb28: // pred: ^bb3
%424 = llvm.add %85, %84 : i64
llvm.br ^bb1(%424 : i64)
^bb29: // pred: ^bb1
%425 = llvm.mlir.constant(0 : i32) : i32
llvm.return %425 : i32
}
}
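// Note: the loop nest above (apparently matmul_test_dispatch_2 shortly after
// conversion to the LLVM dialect) still materializes full memref descriptors,
// i.e. llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
// values built up through llvm.mlir.undef / llvm.insertvalue chains, and it
// re-materializes constants such as 10 and 64 at every use. The Canonicalizer
// and CSE dumps that follow strip the descriptors down to a base pointer plus
// index arithmetic and deduplicate the constants.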
// -----// IR Dump After CSE //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.mlir.constant(10 : index) : i64
%2 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%3 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%4 = llvm.mlir.constant(1 : index) : i64
%5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.mlir.constant(0 : i64) : i64
%8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%9 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32>
%12 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%13 = llvm.extractvalue %12[0] : !llvm.array<3 x i32>
%14 = llvm.zext %13 : i32 to i64
%15 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%16 = llvm.extractvalue %15[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%17 = llvm.extractvalue %16[0] : !llvm.array<3 x i32>
%18 = llvm.zext %17 : i32 to i64
%19 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%20 = llvm.extractvalue %19[1] : !llvm.array<3 x i32>
%21 = llvm.zext %20 : i32 to i64
%22 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%23 = llvm.extractvalue %22[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%24 = llvm.extractvalue %23[1] : !llvm.array<3 x i32>
%25 = llvm.zext %24 : i32 to i64
%26 = llvm.mlir.constant(64 : index) : i64
%27 = llvm.mul %21, %26 : i64
%28 = llvm.mul %25, %26 : i64
llvm.br ^bb1(%27 : i64)
^bb1(%29: i64): // 2 preds: ^bb0, ^bb10
%30 = llvm.icmp "slt" %29, %1 : i64
llvm.cond_br %30, ^bb2, ^bb11
^bb2: // pred: ^bb1
%31 = llvm.mul %14, %26 : i64
%32 = llvm.mul %18, %26 : i64
llvm.br ^bb3(%31 : i64)
^bb3(%33: i64): // 2 preds: ^bb2, ^bb9
%34 = llvm.icmp "slt" %33, %1 : i64
llvm.cond_br %34, ^bb4, ^bb10
^bb4: // pred: ^bb3
%35 = llvm.mlir.constant(-1 : index) : i64
%36 = llvm.mul %29, %35 : i64
%37 = llvm.add %36, %1 : i64
%38 = llvm.icmp "slt" %26, %37 : i64
%39 = llvm.select %38, %26, %37 : i1, i64
%40 = llvm.mul %33, %35 : i64
%41 = llvm.add %40, %1 : i64
%42 = llvm.icmp "slt" %26, %41 : i64
%43 = llvm.select %42, %26, %41 : i1, i64
%44 = llvm.bitcast %11 : !llvm.ptr<f32> to !llvm.ptr<f32>
%45 = llvm.mul %29, %1 : i64
%46 = llvm.add %0, %45 : i64
%47 = llvm.mul %33, %4 : i64
%48 = llvm.add %46, %47 : i64
llvm.br ^bb5(%0 : i64)
^bb5(%49: i64): // 2 preds: ^bb4, ^bb8
%50 = llvm.icmp "slt" %49, %39 : i64
llvm.cond_br %50, ^bb6(%0 : i64), ^bb9
^bb6(%51: i64): // 2 preds: ^bb5, ^bb7
%52 = llvm.icmp "slt" %51, %43 : i64
llvm.cond_br %52, ^bb7, ^bb8
^bb7: // pred: ^bb6
%53 = llvm.add %49, %29 : i64
%54 = llvm.add %51, %33 : i64
%55 = llvm.icmp "eq" %53, %54 : i64
%56 = llvm.select %55, %2, %3 : i1, f32
%57 = llvm.mul %49, %1 : i64
%58 = llvm.add %48, %57 : i64
%59 = llvm.add %58, %51 : i64
%60 = llvm.getelementptr %44[%59] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %56, %60 : !llvm.ptr<f32>
%61 = llvm.add %51, %4 : i64
llvm.br ^bb6(%61 : i64)
^bb8: // pred: ^bb6
%62 = llvm.add %49, %4 : i64
llvm.br ^bb5(%62 : i64)
^bb9: // pred: ^bb5
%63 = llvm.add %33, %32 : i64
llvm.br ^bb3(%63 : i64)
^bb10: // pred: ^bb3
%64 = llvm.add %29, %28 : i64
llvm.br ^bb1(%64 : i64)
^bb11: // pred: ^bb1
%65 = llvm.mlir.constant(0 : i32) : i32
llvm.return %65 : i32
}
}
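// Note: after CSE, matmul_test_dispatch_0 keeps a single copy of each
// constant (e.g. the one 64 : index at %26 now feeds all four workgroup
// stride multiplications %27, %28, %31 and %32). What remains is a
// workgroup-distributed fill of the 10x10 f32 buffer at binding 0 with 0.0
// on the diagonal and 1.0 elsewhere. A rough C sketch of that loop
// structure, reconstructed by hand from the IR above (wg_id_* / wg_count_*
// stand for the values loaded from %arg1 and from field 0 of the dispatch
// state, min() stands for the icmp "slt" + select pairs; the names are
// illustrative, not IREE's):
//
//   void matmul_test_dispatch_0_sketch(float *out /* binding 0 */) {
//     for (int64_t i0 = wg_id_y * 64; i0 < 10; i0 += wg_count_y * 64)      // ^bb1
//       for (int64_t j0 = wg_id_x * 64; j0 < 10; j0 += wg_count_x * 64) {  // ^bb3
//         int64_t ti = min(64, 10 - i0), tj = min(64, 10 - j0);
//         for (int64_t i = 0; i < ti; ++i)                                 // ^bb5
//           for (int64_t j = 0; j < tj; ++j)                               // ^bb6
//             out[(i0 + i) * 10 + (j0 + j)] =
//                 (i0 + i == j0 + j) ? 0.0f : 1.0f;                        // ^bb7
//       }
//   }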
// -----// IR Dump After Canonicalizer //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(4 : index) : i64
%1 = llvm.mlir.constant(32 : index) : i64
%2 = llvm.mlir.constant(0 : index) : i64
%3 = llvm.mlir.constant(10 : index) : i64
%4 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%5 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%11 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32>
%14 = llvm.mlir.constant(0 : index) : i64
%15 = llvm.mlir.constant(10 : index) : i64
%16 = llvm.mlir.constant(1 : index) : i64
%17 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%18 = llvm.extractvalue %17[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%19 = llvm.mlir.constant(1 : i64) : i64
%20 = llvm.getelementptr %18[%19] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%21 = llvm.load %20 : !llvm.ptr<ptr<i8>>
%22 = llvm.getelementptr %21[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%23 = llvm.bitcast %22 : !llvm.ptr<i8> to !llvm.ptr<f32>
%24 = llvm.mlir.constant(0 : index) : i64
%25 = llvm.mlir.constant(10 : index) : i64
%26 = llvm.mlir.constant(1 : index) : i64
%27 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%28 = llvm.extractvalue %27[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%29 = llvm.mlir.constant(2 : i64) : i64
%30 = llvm.getelementptr %28[%29] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%31 = llvm.load %30 : !llvm.ptr<ptr<i8>>
%32 = llvm.getelementptr %31[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%33 = llvm.bitcast %32 : !llvm.ptr<i8> to !llvm.ptr<f32>
%34 = llvm.mlir.constant(0 : index) : i64
%35 = llvm.mlir.constant(10 : index) : i64
%36 = llvm.mlir.constant(1 : index) : i64
%37 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%38 = llvm.extractvalue %37[0] : !llvm.array<3 x i32>
%39 = llvm.zext %38 : i32 to i64
%40 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%41 = llvm.extractvalue %40[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%42 = llvm.extractvalue %41[0] : !llvm.array<3 x i32>
%43 = llvm.zext %42 : i32 to i64
%44 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%45 = llvm.extractvalue %44[1] : !llvm.array<3 x i32>
%46 = llvm.zext %45 : i32 to i64
%47 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%48 = llvm.extractvalue %47[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%49 = llvm.extractvalue %48[1] : !llvm.array<3 x i32>
%50 = llvm.zext %49 : i32 to i64
%51 = llvm.mlir.constant(64 : index) : i64
%52 = llvm.mul %46, %51 : i64
%53 = llvm.mlir.constant(64 : index) : i64
%54 = llvm.mul %50, %53 : i64
llvm.br ^bb1(%52 : i64)
^bb1(%55: i64): // 2 preds: ^bb0, ^bb28
%56 = llvm.icmp "slt" %55, %3 : i64
llvm.cond_br %56, ^bb2, ^bb29
^bb2: // pred: ^bb1
%57 = llvm.mlir.constant(64 : index) : i64
%58 = llvm.mul %39, %57 : i64
%59 = llvm.mlir.constant(64 : index) : i64
%60 = llvm.mul %43, %59 : i64
llvm.br ^bb3(%58 : i64)
^bb3(%61: i64): // 2 preds: ^bb2, ^bb27
%62 = llvm.icmp "slt" %61, %3 : i64
llvm.cond_br %62, ^bb4, ^bb28
^bb4: // pred: ^bb3
%63 = llvm.mlir.constant(64 : index) : i64
%64 = llvm.mlir.constant(-1 : index) : i64
%65 = llvm.mul %55, %64 : i64
%66 = llvm.mlir.constant(10 : index) : i64
%67 = llvm.add %65, %66 : i64
%68 = llvm.icmp "slt" %63, %67 : i64
%69 = llvm.select %68, %63, %67 : i1, i64
%70 = llvm.bitcast %13 : !llvm.ptr<f32> to !llvm.ptr<f32>
%71 = llvm.mul %55, %15 : i64
%72 = llvm.add %14, %71 : i64
%73 = llvm.mlir.constant(0 : i64) : i64
%74 = llvm.mul %73, %16 : i64
%75 = llvm.add %72, %74 : i64
%76 = llvm.mlir.constant(1 : i64) : i64
%77 = llvm.mlir.constant(10 : i64) : i64
%78 = llvm.mlir.constant(64 : index) : i64
%79 = llvm.mlir.constant(-1 : index) : i64
%80 = llvm.mul %61, %79 : i64
%81 = llvm.mlir.constant(10 : index) : i64
%82 = llvm.add %80, %81 : i64
%83 = llvm.icmp "slt" %78, %82 : i64
%84 = llvm.select %83, %78, %82 : i1, i64
%85 = llvm.bitcast %23 : !llvm.ptr<f32> to !llvm.ptr<f32>
%86 = llvm.mlir.constant(0 : i64) : i64
%87 = llvm.mul %86, %25 : i64
%88 = llvm.add %24, %87 : i64
%89 = llvm.mul %61, %26 : i64
%90 = llvm.add %88, %89 : i64
%91 = llvm.mlir.constant(1 : i64) : i64
%92 = llvm.mlir.constant(10 : i64) : i64
%93 = llvm.mlir.constant(-1 : index) : i64
%94 = llvm.mul %55, %93 : i64
%95 = llvm.mlir.constant(10 : index) : i64
%96 = llvm.add %94, %95 : i64
%97 = llvm.mlir.constant(64 : index) : i64
%98 = llvm.icmp "slt" %96, %97 : i64
%99 = llvm.select %98, %96, %97 : i1, i64
%100 = llvm.mlir.constant(-1 : index) : i64
%101 = llvm.mul %61, %100 : i64
%102 = llvm.mlir.constant(10 : index) : i64
%103 = llvm.add %101, %102 : i64
%104 = llvm.mlir.constant(64 : index) : i64
%105 = llvm.icmp "slt" %103, %104 : i64
%106 = llvm.select %105, %103, %104 : i1, i64
%107 = llvm.bitcast %33 : !llvm.ptr<f32> to !llvm.ptr<f32>
%108 = llvm.mul %55, %35 : i64
%109 = llvm.add %34, %108 : i64
%110 = llvm.mul %61, %36 : i64
%111 = llvm.add %109, %110 : i64
%112 = llvm.mlir.constant(1 : i64) : i64
%113 = llvm.mlir.constant(10 : i64) : i64
llvm.br ^bb5(%2 : i64)
^bb5(%114: i64): // 2 preds: ^bb4, ^bb8
%115 = llvm.icmp "slt" %114, %99 : i64
llvm.cond_br %115, ^bb6(%2 : i64), ^bb9(%2 : i64)
^bb6(%116: i64): // 2 preds: ^bb5, ^bb7
%117 = llvm.icmp "slt" %116, %106 : i64
llvm.cond_br %117, ^bb7, ^bb8
^bb7: // pred: ^bb6
%118 = llvm.add %114, %55 : i64
%119 = llvm.add %116, %61 : i64
%120 = llvm.icmp "eq" %118, %119 : i64
%121 = llvm.select %120, %4, %5 : i1, f32
%122 = llvm.mlir.constant(10 : index) : i64
%123 = llvm.mul %114, %122 : i64
%124 = llvm.add %111, %123 : i64
%125 = llvm.add %124, %116 : i64
%126 = llvm.getelementptr %107[%125] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %121, %126 : !llvm.ptr<f32>
%127 = llvm.add %116, %6 : i64
llvm.br ^bb6(%127 : i64)
^bb8: // pred: ^bb6
%128 = llvm.add %114, %6 : i64
llvm.br ^bb5(%128 : i64)
^bb9(%129: i64): // 2 preds: ^bb5, ^bb26
%130 = llvm.icmp "slt" %129, %69 : i64
llvm.cond_br %130, ^bb10(%2 : i64), ^bb27
^bb10(%131: i64): // 2 preds: ^bb9, ^bb25
%132 = llvm.icmp "slt" %131, %84 : i64
llvm.cond_br %132, ^bb11, ^bb26
^bb11: // pred: ^bb10
%133 = llvm.mlir.constant(32 : index) : i64
%134 = llvm.mlir.constant(-1 : index) : i64
%135 = llvm.mul %129, %134 : i64
%136 = llvm.add %69, %135 : i64
%137 = llvm.icmp "slt" %133, %136 : i64
%138 = llvm.select %137, %133, %136 : i1, i64
%139 = llvm.bitcast %70 : !llvm.ptr<f32> to !llvm.ptr<f32>
%140 = llvm.mul %129, %77 : i64
%141 = llvm.add %75, %140 : i64
%142 = llvm.mlir.constant(0 : i64) : i64
%143 = llvm.mul %142, %76 : i64
%144 = llvm.add %141, %143 : i64
%145 = llvm.mlir.constant(1 : i64) : i64
%146 = llvm.mlir.constant(10 : i64) : i64
%147 = llvm.mlir.constant(32 : index) : i64
%148 = llvm.mlir.constant(-1 : index) : i64
%149 = llvm.mul %131, %148 : i64
%150 = llvm.add %84, %149 : i64
%151 = llvm.icmp "slt" %147, %150 : i64
%152 = llvm.select %151, %147, %150 : i1, i64
%153 = llvm.bitcast %85 : !llvm.ptr<f32> to !llvm.ptr<f32>
%154 = llvm.mlir.constant(0 : i64) : i64
%155 = llvm.mul %154, %92 : i64
%156 = llvm.add %90, %155 : i64
%157 = llvm.mul %131, %91 : i64
%158 = llvm.add %156, %157 : i64
%159 = llvm.mlir.constant(1 : i64) : i64
%160 = llvm.mlir.constant(10 : i64) : i64
%161 = llvm.bitcast %107 : !llvm.ptr<f32> to !llvm.ptr<f32>
%162 = llvm.mul %129, %113 : i64
%163 = llvm.add %111, %162 : i64
%164 = llvm.mul %131, %112 : i64
%165 = llvm.add %163, %164 : i64
%166 = llvm.mlir.constant(1 : i64) : i64
%167 = llvm.mlir.constant(10 : i64) : i64
llvm.br ^bb12(%2 : i64)
^bb12(%168: i64): // 2 preds: ^bb11, ^bb24
%169 = llvm.icmp "slt" %168, %138 : i64
llvm.cond_br %169, ^bb13(%2 : i64), ^bb25
^bb13(%170: i64): // 2 preds: ^bb12, ^bb23
%171 = llvm.icmp "slt" %170, %152 : i64
llvm.cond_br %171, ^bb14(%2 : i64), ^bb24
^bb14(%172: i64): // 2 preds: ^bb13, ^bb22
%173 = llvm.icmp "slt" %172, %3 : i64
llvm.cond_br %173, ^bb15, ^bb23
^bb15: // pred: ^bb14
%174 = llvm.mlir.constant(4 : index) : i64
%175 = llvm.mlir.constant(-1 : index) : i64
%176 = llvm.mul %168, %175 : i64
%177 = llvm.add %138, %176 : i64
%178 = llvm.icmp "slt" %174, %177 : i64
%179 = llvm.select %178, %174, %177 : i1, i64
%180 = llvm.mlir.constant(4 : index) : i64
%181 = llvm.mlir.constant(-1 : index) : i64
%182 = llvm.mul %172, %181 : i64
%183 = llvm.mlir.constant(10 : index) : i64
%184 = llvm.add %182, %183 : i64
%185 = llvm.icmp "slt" %180, %184 : i64
%186 = llvm.select %185, %180, %184 : i1, i64
%187 = llvm.bitcast %139 : !llvm.ptr<f32> to !llvm.ptr<f32>
%188 = llvm.mul %168, %146 : i64
%189 = llvm.add %144, %188 : i64
%190 = llvm.mul %172, %145 : i64
%191 = llvm.add %189, %190 : i64
%192 = llvm.mlir.constant(4 : index) : i64
%193 = llvm.mlir.constant(-1 : index) : i64
%194 = llvm.mul %170, %193 : i64
%195 = llvm.add %152, %194 : i64
%196 = llvm.icmp "slt" %192, %195 : i64
%197 = llvm.select %196, %192, %195 : i1, i64
%198 = llvm.bitcast %153 : !llvm.ptr<f32> to !llvm.ptr<f32>
%199 = llvm.mul %172, %160 : i64
%200 = llvm.add %158, %199 : i64
%201 = llvm.mul %170, %159 : i64
%202 = llvm.add %200, %201 : i64
%203 = llvm.bitcast %161 : !llvm.ptr<f32> to !llvm.ptr<f32>
%204 = llvm.mul %168, %167 : i64
%205 = llvm.add %165, %204 : i64
%206 = llvm.mul %170, %166 : i64
%207 = llvm.add %205, %206 : i64
llvm.br ^bb16(%2 : i64)
^bb16(%208: i64): // 2 preds: ^bb15, ^bb21
%209 = llvm.icmp "slt" %208, %179 : i64
llvm.cond_br %209, ^bb17(%2 : i64), ^bb22
^bb17(%210: i64): // 2 preds: ^bb16, ^bb20
%211 = llvm.icmp "slt" %210, %197 : i64
llvm.cond_br %211, ^bb18(%2 : i64), ^bb21
^bb18(%212: i64): // 2 preds: ^bb17, ^bb19
%213 = llvm.icmp "slt" %212, %186 : i64
llvm.cond_br %213, ^bb19, ^bb20
^bb19: // pred: ^bb18
%214 = llvm.mlir.constant(10 : index) : i64
%215 = llvm.mul %208, %214 : i64
%216 = llvm.add %191, %215 : i64
%217 = llvm.add %216, %212 : i64
%218 = llvm.getelementptr %187[%217] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%219 = llvm.load %218 : !llvm.ptr<f32>
%220 = llvm.mlir.constant(10 : index) : i64
%221 = llvm.mul %212, %220 : i64
%222 = llvm.add %202, %221 : i64
%223 = llvm.add %222, %210 : i64
%224 = llvm.getelementptr %198[%223] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%225 = llvm.load %224 : !llvm.ptr<f32>
%226 = llvm.mlir.constant(10 : index) : i64
%227 = llvm.mul %208, %226 : i64
%228 = llvm.add %207, %227 : i64
%229 = llvm.add %228, %210 : i64
%230 = llvm.getelementptr %203[%229] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%231 = llvm.load %230 : !llvm.ptr<f32>
%232 = llvm.fmul %219, %225 : f32
%233 = llvm.fadd %231, %232 : f32
%234 = llvm.mlir.constant(10 : index) : i64
%235 = llvm.mul %208, %234 : i64
%236 = llvm.add %207, %235 : i64
%237 = llvm.add %236, %210 : i64
%238 = llvm.getelementptr %203[%237] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %233, %238 : !llvm.ptr<f32>
%239 = llvm.add %212, %6 : i64
llvm.br ^bb18(%239 : i64)
^bb20: // pred: ^bb18
%240 = llvm.add %210, %6 : i64
llvm.br ^bb17(%240 : i64)
^bb21: // pred: ^bb17
%241 = llvm.add %208, %6 : i64
llvm.br ^bb16(%241 : i64)
^bb22: // pred: ^bb16
%242 = llvm.add %172, %0 : i64
llvm.br ^bb14(%242 : i64)
^bb23: // pred: ^bb14
%243 = llvm.add %170, %0 : i64
llvm.br ^bb13(%243 : i64)
^bb24: // pred: ^bb13
%244 = llvm.add %168, %0 : i64
llvm.br ^bb12(%244 : i64)
^bb25: // pred: ^bb12
%245 = llvm.add %131, %1 : i64
llvm.br ^bb10(%245 : i64)
^bb26: // pred: ^bb10
%246 = llvm.add %129, %1 : i64
llvm.br ^bb9(%246 : i64)
^bb27: // pred: ^bb9
%247 = llvm.add %61, %60 : i64
llvm.br ^bb3(%247 : i64)
^bb28: // pred: ^bb3
%248 = llvm.add %55, %54 : i64
llvm.br ^bb1(%248 : i64)
^bb29: // pred: ^bb1
%249 = llvm.mlir.constant(0 : i32) : i32
llvm.return %249 : i32
}
}
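// Note: canonicalization has folded away the memref descriptor structs in
// matmul_test_dispatch_2 (compare with the pre-canonicalized dump of the
// same loop nest further up), but duplicate constants survive, e.g.
// 10 : i64 is re-materialized as %77, %92, %113, %146, %160 and %167.
// The CSE dump below collapses those duplicates.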
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass //----- //
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
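// Note: the entry point region above ceil-divides the first two workload
// dimensions by 64 and returns 1 for the third, matching
// workloadPerWorkgroup = [64, 64]; for the 10x10 shapes in this test that
// evaluates to a single 1x1x1 workgroup.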
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.mlir.constant(10 : index) : i64
%2 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%3 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%4 = llvm.mlir.constant(1 : index) : i64
%5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.mlir.constant(0 : i64) : i64
%8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%9 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32>
%12 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%13 = llvm.extractvalue %12[0] : !llvm.array<3 x i32>
%14 = llvm.zext %13 : i32 to i64
%15 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%16 = llvm.extractvalue %15[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%17 = llvm.extractvalue %16[0] : !llvm.array<3 x i32>
%18 = llvm.zext %17 : i32 to i64
%19 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%20 = llvm.extractvalue %19[1] : !llvm.array<3 x i32>
%21 = llvm.zext %20 : i32 to i64
%22 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%23 = llvm.extractvalue %22[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%24 = llvm.extractvalue %23[1] : !llvm.array<3 x i32>
%25 = llvm.zext %24 : i32 to i64
%26 = llvm.mlir.constant(64 : index) : i64
%27 = llvm.mul %21, %26 : i64
%28 = llvm.mul %25, %26 : i64
llvm.br ^bb1(%27 : i64)
^bb1(%29: i64): // 2 preds: ^bb0, ^bb10
%30 = llvm.icmp "slt" %29, %1 : i64
llvm.cond_br %30, ^bb2, ^bb11
^bb2: // pred: ^bb1
%31 = llvm.mul %14, %26 : i64
%32 = llvm.mul %18, %26 : i64
llvm.br ^bb3(%31 : i64)
^bb3(%33: i64): // 2 preds: ^bb2, ^bb9
%34 = llvm.icmp "slt" %33, %1 : i64
llvm.cond_br %34, ^bb4, ^bb10
^bb4: // pred: ^bb3
%35 = llvm.mlir.constant(-1 : index) : i64
%36 = llvm.mul %29, %35 : i64
%37 = llvm.add %36, %1 : i64
%38 = llvm.icmp "slt" %26, %37 : i64
%39 = llvm.select %38, %26, %37 : i1, i64
%40 = llvm.mul %33, %35 : i64
%41 = llvm.add %40, %1 : i64
%42 = llvm.icmp "slt" %26, %41 : i64
%43 = llvm.select %42, %26, %41 : i1, i64
%44 = llvm.bitcast %11 : !llvm.ptr<f32> to !llvm.ptr<f32>
%45 = llvm.mul %29, %1 : i64
%46 = llvm.add %0, %45 : i64
%47 = llvm.mul %33, %4 : i64
%48 = llvm.add %46, %47 : i64
llvm.br ^bb5(%0 : i64)
^bb5(%49: i64): // 2 preds: ^bb4, ^bb8
%50 = llvm.icmp "slt" %49, %39 : i64
llvm.cond_br %50, ^bb6(%0 : i64), ^bb9
^bb6(%51: i64): // 2 preds: ^bb5, ^bb7
%52 = llvm.icmp "slt" %51, %43 : i64
llvm.cond_br %52, ^bb7, ^bb8
^bb7: // pred: ^bb6
%53 = llvm.add %49, %29 : i64
%54 = llvm.add %51, %33 : i64
%55 = llvm.icmp "eq" %53, %54 : i64
%56 = llvm.select %55, %2, %3 : i1, f32
%57 = llvm.mul %49, %1 : i64
%58 = llvm.add %48, %57 : i64
%59 = llvm.add %58, %51 : i64
%60 = llvm.getelementptr %44[%59] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %56, %60 : !llvm.ptr<f32>
%61 = llvm.add %51, %4 : i64
llvm.br ^bb6(%61 : i64)
^bb8: // pred: ^bb6
%62 = llvm.add %49, %4 : i64
llvm.br ^bb5(%62 : i64)
^bb9: // pred: ^bb5
%63 = llvm.add %33, %32 : i64
llvm.br ^bb3(%63 : i64)
^bb10: // pred: ^bb3
%64 = llvm.add %29, %28 : i64
llvm.br ^bb1(%64 : i64)
^bb11: // pred: ^bb1
%65 = llvm.mlir.constant(0 : i32) : i32
llvm.return %65 : i32
}
}
}
// -----// IR Dump After CSE //----- //
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(4 : index) : i64
%1 = llvm.mlir.constant(32 : index) : i64
%2 = llvm.mlir.constant(0 : index) : i64
%3 = llvm.mlir.constant(10 : index) : i64
%4 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%5 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%11 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32>
%14 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%15 = llvm.extractvalue %14[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%16 = llvm.mlir.constant(1 : i64) : i64
%17 = llvm.getelementptr %15[%16] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%18 = llvm.load %17 : !llvm.ptr<ptr<i8>>
%19 = llvm.getelementptr %18[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<f32>
%21 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.mlir.constant(2 : i64) : i64
%24 = llvm.getelementptr %22[%23] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%25 = llvm.load %24 : !llvm.ptr<ptr<i8>>
%26 = llvm.getelementptr %25[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%27 = llvm.bitcast %26 : !llvm.ptr<i8> to !llvm.ptr<f32>
%28 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%29 = llvm.extractvalue %28[0] : !llvm.array<3 x i32>
%30 = llvm.zext %29 : i32 to i64
%31 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%32 = llvm.extractvalue %31[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%33 = llvm.extractvalue %32[0] : !llvm.array<3 x i32>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%36 = llvm.extractvalue %35[1] : !llvm.array<3 x i32>
%37 = llvm.zext %36 : i32 to i64
%38 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%39 = llvm.extractvalue %38[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%40 = llvm.extractvalue %39[1] : !llvm.array<3 x i32>
%41 = llvm.zext %40 : i32 to i64
%42 = llvm.mlir.constant(64 : index) : i64
%43 = llvm.mul %37, %42 : i64
%44 = llvm.mul %41, %42 : i64
llvm.br ^bb1(%43 : i64)
^bb1(%45: i64): // 2 preds: ^bb0, ^bb28
%46 = llvm.icmp "slt" %45, %3 : i64
llvm.cond_br %46, ^bb2, ^bb29
^bb2: // pred: ^bb1
%47 = llvm.mul %30, %42 : i64
%48 = llvm.mul %34, %42 : i64
llvm.br ^bb3(%47 : i64)
^bb3(%49: i64): // 2 preds: ^bb2, ^bb27
%50 = llvm.icmp "slt" %49, %3 : i64
llvm.cond_br %50, ^bb4, ^bb28
^bb4: // pred: ^bb3
%51 = llvm.mlir.constant(-1 : index) : i64
%52 = llvm.mul %45, %51 : i64
%53 = llvm.add %52, %3 : i64
%54 = llvm.icmp "slt" %42, %53 : i64
%55 = llvm.select %54, %42, %53 : i1, i64
%56 = llvm.bitcast %13 : !llvm.ptr<f32> to !llvm.ptr<f32>
%57 = llvm.mul %45, %3 : i64
%58 = llvm.add %2, %57 : i64
%59 = llvm.mul %9, %6 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mlir.constant(10 : i64) : i64
%62 = llvm.mul %49, %51 : i64
%63 = llvm.add %62, %3 : i64
%64 = llvm.icmp "slt" %42, %63 : i64
%65 = llvm.select %64, %42, %63 : i1, i64
%66 = llvm.bitcast %20 : !llvm.ptr<f32> to !llvm.ptr<f32>
%67 = llvm.mul %9, %3 : i64
%68 = llvm.add %2, %67 : i64
%69 = llvm.mul %49, %6 : i64
%70 = llvm.add %68, %69 : i64
%71 = llvm.icmp "slt" %53, %42 : i64
%72 = llvm.select %71, %53, %42 : i1, i64
%73 = llvm.icmp "slt" %63, %42 : i64
%74 = llvm.select %73, %63, %42 : i1, i64
%75 = llvm.bitcast %27 : !llvm.ptr<f32> to !llvm.ptr<f32>
%76 = llvm.add %58, %69 : i64
llvm.br ^bb5(%2 : i64)
^bb5(%77: i64): // 2 preds: ^bb4, ^bb8
%78 = llvm.icmp "slt" %77, %72 : i64
llvm.cond_br %78, ^bb6(%2 : i64), ^bb9(%2 : i64)
^bb6(%79: i64): // 2 preds: ^bb5, ^bb7
%80 = llvm.icmp "slt" %79, %74 : i64
llvm.cond_br %80, ^bb7, ^bb8
^bb7: // pred: ^bb6
%81 = llvm.add %77, %45 : i64
%82 = llvm.add %79, %49 : i64
%83 = llvm.icmp "eq" %81, %82 : i64
%84 = llvm.select %83, %4, %5 : i1, f32
%85 = llvm.mul %77, %3 : i64
%86 = llvm.add %76, %85 : i64
%87 = llvm.add %86, %79 : i64
%88 = llvm.getelementptr %75[%87] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %84, %88 : !llvm.ptr<f32>
%89 = llvm.add %79, %6 : i64
llvm.br ^bb6(%89 : i64)
^bb8: // pred: ^bb6
%90 = llvm.add %77, %6 : i64
llvm.br ^bb5(%90 : i64)
^bb9(%91: i64): // 2 preds: ^bb5, ^bb26
%92 = llvm.icmp "slt" %91, %55 : i64
llvm.cond_br %92, ^bb10(%2 : i64), ^bb27
^bb10(%93: i64): // 2 preds: ^bb9, ^bb25
%94 = llvm.icmp "slt" %93, %65 : i64
llvm.cond_br %94, ^bb11, ^bb26
^bb11: // pred: ^bb10
%95 = llvm.mul %91, %51 : i64
%96 = llvm.add %55, %95 : i64
%97 = llvm.icmp "slt" %1, %96 : i64
%98 = llvm.select %97, %1, %96 : i1, i64
%99 = llvm.bitcast %56 : !llvm.ptr<f32> to !llvm.ptr<f32>
%100 = llvm.mul %91, %61 : i64
%101 = llvm.add %60, %100 : i64
%102 = llvm.mul %9, %16 : i64
%103 = llvm.add %101, %102 : i64
%104 = llvm.mul %93, %51 : i64
%105 = llvm.add %65, %104 : i64
%106 = llvm.icmp "slt" %1, %105 : i64
%107 = llvm.select %106, %1, %105 : i1, i64
%108 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32>
%109 = llvm.mul %9, %61 : i64
%110 = llvm.add %70, %109 : i64
%111 = llvm.mul %93, %16 : i64
%112 = llvm.add %110, %111 : i64
%113 = llvm.bitcast %75 : !llvm.ptr<f32> to !llvm.ptr<f32>
%114 = llvm.add %76, %100 : i64
%115 = llvm.add %114, %111 : i64
llvm.br ^bb12(%2 : i64)
^bb12(%116: i64): // 2 preds: ^bb11, ^bb24
%117 = llvm.icmp "slt" %116, %98 : i64
llvm.cond_br %117, ^bb13(%2 : i64), ^bb25
^bb13(%118: i64): // 2 preds: ^bb12, ^bb23
%119 = llvm.icmp "slt" %118, %107 : i64
llvm.cond_br %119, ^bb14(%2 : i64), ^bb24
^bb14(%120: i64): // 2 preds: ^bb13, ^bb22
%121 = llvm.icmp "slt" %120, %3 : i64
llvm.cond_br %121, ^bb15, ^bb23
^bb15: // pred: ^bb14
%122 = llvm.mul %116, %51 : i64
%123 = llvm.add %98, %122 : i64
%124 = llvm.icmp "slt" %0, %123 : i64
%125 = llvm.select %124, %0, %123 : i1, i64
%126 = llvm.mul %120, %51 : i64
%127 = llvm.add %126, %3 : i64
%128 = llvm.icmp "slt" %0, %127 : i64
%129 = llvm.select %128, %0, %127 : i1, i64
%130 = llvm.bitcast %99 : !llvm.ptr<f32> to !llvm.ptr<f32>
%131 = llvm.mul %116, %61 : i64
%132 = llvm.add %103, %131 : i64
%133 = llvm.mul %120, %16 : i64
%134 = llvm.add %132, %133 : i64
%135 = llvm.mul %118, %51 : i64
%136 = llvm.add %107, %135 : i64
%137 = llvm.icmp "slt" %0, %136 : i64
%138 = llvm.select %137, %0, %136 : i1, i64
%139 = llvm.bitcast %108 : !llvm.ptr<f32> to !llvm.ptr<f32>
%140 = llvm.mul %120, %61 : i64
%141 = llvm.add %112, %140 : i64
%142 = llvm.mul %118, %16 : i64
%143 = llvm.add %141, %142 : i64
%144 = llvm.bitcast %113 : !llvm.ptr<f32> to !llvm.ptr<f32>
%145 = llvm.add %115, %131 : i64
%146 = llvm.add %145, %142 : i64
llvm.br ^bb16(%2 : i64)
^bb16(%147: i64): // 2 preds: ^bb15, ^bb21
%148 = llvm.icmp "slt" %147, %125 : i64
llvm.cond_br %148, ^bb17(%2 : i64), ^bb22
^bb17(%149: i64): // 2 preds: ^bb16, ^bb20
%150 = llvm.icmp "slt" %149, %138 : i64
llvm.cond_br %150, ^bb18(%2 : i64), ^bb21
^bb18(%151: i64): // 2 preds: ^bb17, ^bb19
%152 = llvm.icmp "slt" %151, %129 : i64
llvm.cond_br %152, ^bb19, ^bb20
^bb19: // pred: ^bb18
%153 = llvm.mul %147, %3 : i64
%154 = llvm.add %134, %153 : i64
%155 = llvm.add %154, %151 : i64
%156 = llvm.getelementptr %130[%155] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%157 = llvm.load %156 : !llvm.ptr<f32>
%158 = llvm.mul %151, %3 : i64
%159 = llvm.add %143, %158 : i64
%160 = llvm.add %159, %149 : i64
%161 = llvm.getelementptr %139[%160] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%162 = llvm.load %161 : !llvm.ptr<f32>
%163 = llvm.add %146, %153 : i64
%164 = llvm.add %163, %149 : i64
%165 = llvm.getelementptr %144[%164] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%166 = llvm.load %165 : !llvm.ptr<f32>
%167 = llvm.fmul %157, %162 : f32
%168 = llvm.fadd %166, %167 : f32
llvm.store %168, %165 : !llvm.ptr<f32>
%169 = llvm.add %151, %6 : i64
llvm.br ^bb18(%169 : i64)
^bb20: // pred: ^bb18
%170 = llvm.add %149, %6 : i64
llvm.br ^bb17(%170 : i64)
^bb21: // pred: ^bb17
%171 = llvm.add %147, %6 : i64
llvm.br ^bb16(%171 : i64)
^bb22: // pred: ^bb16
%172 = llvm.add %120, %0 : i64
llvm.br ^bb14(%172 : i64)
^bb23: // pred: ^bb14
%173 = llvm.add %118, %0 : i64
llvm.br ^bb13(%173 : i64)
^bb24: // pred: ^bb13
%174 = llvm.add %116, %0 : i64
llvm.br ^bb12(%174 : i64)
^bb25: // pred: ^bb12
%175 = llvm.add %93, %1 : i64
llvm.br ^bb10(%175 : i64)
^bb26: // pred: ^bb10
%176 = llvm.add %91, %1 : i64
llvm.br ^bb9(%176 : i64)
^bb27: // pred: ^bb9
%177 = llvm.add %49, %48 : i64
llvm.br ^bb3(%177 : i64)
^bb28: // pred: ^bb3
%178 = llvm.add %45, %44 : i64
llvm.br ^bb1(%178 : i64)
^bb29: // pred: ^bb1
%179 = llvm.mlir.constant(0 : i32) : i32
llvm.return %179 : i32
}
}
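// Note: after CSE, matmul_test_dispatch_2 reads three buffer pointers from
// the dispatch state (bindings 0, 1 and 2 at %13, %20 and %27) and keeps one
// copy of each constant, though both %3 = 10 : index and %61 = 10 : i64
// survive, presumably because CSE keys on the full attribute including its
// type. A near-miss of the same kind: %55 and %72 both compute
// min(64, 10 - i0), but with swapped select operands, so they do not merge.
// A rough C sketch of the loop structure above, reconstructed by hand from
// the IR (a, b, c are the binding 0/1/2 pointers viewed as 10x10 row-major
// f32; wg_id_* / wg_count_* and min() are illustrative names as before):
//
//   void matmul_test_dispatch_2_sketch(float *a, float *b, float *c) {
//     for (int64_t i0 = wg_id_y * 64; i0 < 10; i0 += wg_count_y * 64)      // ^bb1
//       for (int64_t j0 = wg_id_x * 64; j0 < 10; j0 += wg_count_x * 64) {  // ^bb3
//         int64_t ti = min(64, 10 - i0), tj = min(64, 10 - j0);
//         // ^bb5..^bb8: initialize the c tile (0.0f on the global
//         // diagonal, 1.0f off it), fused ahead of the accumulation.
//         for (int64_t i = 0; i < ti; ++i)
//           for (int64_t j = 0; j < tj; ++j)
//             c[(i0 + i) * 10 + (j0 + j)] = (i0 + i == j0 + j) ? 0.0f : 1.0f;
//         // ^bb9..^bb25: 32x32 middle tiles and 4x4x4 inner tiles of
//         // c[row][col] += a[row][k] * b[k][col].
//         for (int64_t i1 = 0; i1 < ti; i1 += 32)
//           for (int64_t j1 = 0; j1 < tj; j1 += 32) {
//             int64_t ui = min(32, ti - i1), uj = min(32, tj - j1);
//             for (int64_t i2 = 0; i2 < ui; i2 += 4)
//               for (int64_t j2 = 0; j2 < uj; j2 += 4)
//                 for (int64_t k0 = 0; k0 < 10; k0 += 4)                   // ^bb14
//                   for (int64_t i = 0; i < min(4, ui - i2); ++i)          // ^bb16
//                     for (int64_t j = 0; j < min(4, uj - j2); ++j)        // ^bb17
//                       for (int64_t k = 0; k < min(4, 10 - k0); ++k) {    // ^bb18
//                         int64_t row = i0 + i1 + i2 + i;
//                         int64_t col = j0 + j1 + j2 + j;
//                         c[row * 10 + col] +=                             // ^bb19
//                             a[row * 10 + (k0 + k)] * b[(k0 + k) * 10 + col];
//                       }
//           }
//       }
//   }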
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass //----- //
hal.executable private @matmul_test_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_xw_external, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.mlir.constant(10 : index) : i64
%2 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%3 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%4 = llvm.mlir.constant(1 : index) : i64
%5 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%6 = llvm.extractvalue %5[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%7 = llvm.mlir.constant(0 : i64) : i64
%8 = llvm.getelementptr %6[%7] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%9 = llvm.load %8 : !llvm.ptr<ptr<i8>>
%10 = llvm.getelementptr %9[%0] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%11 = llvm.bitcast %10 : !llvm.ptr<i8> to !llvm.ptr<f32>
%12 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%13 = llvm.extractvalue %12[0] : !llvm.array<3 x i32>
%14 = llvm.zext %13 : i32 to i64
%15 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%16 = llvm.extractvalue %15[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%17 = llvm.extractvalue %16[0] : !llvm.array<3 x i32>
%18 = llvm.zext %17 : i32 to i64
%19 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%20 = llvm.extractvalue %19[1] : !llvm.array<3 x i32>
%21 = llvm.zext %20 : i32 to i64
%22 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%23 = llvm.extractvalue %22[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%24 = llvm.extractvalue %23[1] : !llvm.array<3 x i32>
%25 = llvm.zext %24 : i32 to i64
%26 = llvm.mlir.constant(64 : index) : i64
%27 = llvm.mul %21, %26 : i64
%28 = llvm.mul %25, %26 : i64
llvm.br ^bb1(%27 : i64)
^bb1(%29: i64): // 2 preds: ^bb0, ^bb10
%30 = llvm.icmp "slt" %29, %1 : i64
llvm.cond_br %30, ^bb2, ^bb11
^bb2: // pred: ^bb1
%31 = llvm.mul %14, %26 : i64
%32 = llvm.mul %18, %26 : i64
llvm.br ^bb3(%31 : i64)
^bb3(%33: i64): // 2 preds: ^bb2, ^bb9
%34 = llvm.icmp "slt" %33, %1 : i64
llvm.cond_br %34, ^bb4, ^bb10
^bb4: // pred: ^bb3
%35 = llvm.mlir.constant(-1 : index) : i64
%36 = llvm.mul %29, %35 : i64
%37 = llvm.add %36, %1 : i64
%38 = llvm.icmp "slt" %26, %37 : i64
%39 = llvm.select %38, %26, %37 : i1, i64
%40 = llvm.mul %33, %35 : i64
%41 = llvm.add %40, %1 : i64
%42 = llvm.icmp "slt" %26, %41 : i64
%43 = llvm.select %42, %26, %41 : i1, i64
%44 = llvm.bitcast %11 : !llvm.ptr<f32> to !llvm.ptr<f32>
%45 = llvm.mul %29, %1 : i64
%46 = llvm.add %0, %45 : i64
%47 = llvm.mul %33, %4 : i64
%48 = llvm.add %46, %47 : i64
llvm.br ^bb5(%0 : i64)
^bb5(%49: i64): // 2 preds: ^bb4, ^bb8
%50 = llvm.icmp "slt" %49, %39 : i64
llvm.cond_br %50, ^bb6(%0 : i64), ^bb9
^bb6(%51: i64): // 2 preds: ^bb5, ^bb7
%52 = llvm.icmp "slt" %51, %43 : i64
llvm.cond_br %52, ^bb7, ^bb8
^bb7: // pred: ^bb6
%53 = llvm.add %49, %29 : i64
%54 = llvm.add %51, %33 : i64
%55 = llvm.icmp "eq" %53, %54 : i64
%56 = llvm.select %55, %2, %3 : i1, f32
%57 = llvm.mul %49, %1 : i64
%58 = llvm.add %48, %57 : i64
%59 = llvm.add %58, %51 : i64
%60 = llvm.getelementptr %44[%59] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %56, %60 : !llvm.ptr<f32>
%61 = llvm.add %51, %4 : i64
llvm.br ^bb6(%61 : i64)
^bb8: // pred: ^bb6
%62 = llvm.add %49, %4 : i64
llvm.br ^bb5(%62 : i64)
^bb9: // pred: ^bb5
%63 = llvm.add %33, %32 : i64
llvm.br ^bb3(%63 : i64)
^bb10: // pred: ^bb3
%64 = llvm.add %29, %28 : i64
llvm.br ^bb1(%64 : i64)
^bb11: // pred: ^bb1
%65 = llvm.mlir.constant(0 : i32) : i32
llvm.return %65 : i32
}
}
}
}
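// [annotation] matmul_test_dispatch_0 above is an input-initialization
// dispatch: for every element of its tile it stores select(i == j, 0.0, 1.0),
// i.e. a 10x10 matrix that is 0 on the diagonal and 1 elsewhere, through the
// single s0b0_xw_external (Write|Discard) binding. The ^bb1/^bb3 loops are
// grid-stride loops, starting at workgroup_id * 64 and stepping by
// workgroup_count * 64 over the static 10x10 iteration space.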
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass //----- //
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(4 : index) : i64
%1 = llvm.mlir.constant(32 : index) : i64
%2 = llvm.mlir.constant(0 : index) : i64
%3 = llvm.mlir.constant(10 : index) : i64
%4 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%5 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%11 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32>
%14 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%15 = llvm.extractvalue %14[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%16 = llvm.mlir.constant(1 : i64) : i64
%17 = llvm.getelementptr %15[%16] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%18 = llvm.load %17 : !llvm.ptr<ptr<i8>>
%19 = llvm.getelementptr %18[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<f32>
%21 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.mlir.constant(2 : i64) : i64
%24 = llvm.getelementptr %22[%23] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%25 = llvm.load %24 : !llvm.ptr<ptr<i8>>
%26 = llvm.getelementptr %25[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%27 = llvm.bitcast %26 : !llvm.ptr<i8> to !llvm.ptr<f32>
%28 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%29 = llvm.extractvalue %28[0] : !llvm.array<3 x i32>
%30 = llvm.zext %29 : i32 to i64
%31 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%32 = llvm.extractvalue %31[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%33 = llvm.extractvalue %32[0] : !llvm.array<3 x i32>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%36 = llvm.extractvalue %35[1] : !llvm.array<3 x i32>
%37 = llvm.zext %36 : i32 to i64
%38 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%39 = llvm.extractvalue %38[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%40 = llvm.extractvalue %39[1] : !llvm.array<3 x i32>
%41 = llvm.zext %40 : i32 to i64
%42 = llvm.mlir.constant(64 : index) : i64
%43 = llvm.mul %37, %42 : i64
%44 = llvm.mul %41, %42 : i64
llvm.br ^bb1(%43 : i64)
^bb1(%45: i64): // 2 preds: ^bb0, ^bb28
%46 = llvm.icmp "slt" %45, %3 : i64
llvm.cond_br %46, ^bb2, ^bb29
^bb2: // pred: ^bb1
%47 = llvm.mul %30, %42 : i64
%48 = llvm.mul %34, %42 : i64
llvm.br ^bb3(%47 : i64)
^bb3(%49: i64): // 2 preds: ^bb2, ^bb27
%50 = llvm.icmp "slt" %49, %3 : i64
llvm.cond_br %50, ^bb4, ^bb28
^bb4: // pred: ^bb3
%51 = llvm.mlir.constant(-1 : index) : i64
%52 = llvm.mul %45, %51 : i64
%53 = llvm.add %52, %3 : i64
%54 = llvm.icmp "slt" %42, %53 : i64
%55 = llvm.select %54, %42, %53 : i1, i64
%56 = llvm.bitcast %13 : !llvm.ptr<f32> to !llvm.ptr<f32>
%57 = llvm.mul %45, %3 : i64
%58 = llvm.add %2, %57 : i64
%59 = llvm.mul %9, %6 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mlir.constant(10 : i64) : i64
%62 = llvm.mul %49, %51 : i64
%63 = llvm.add %62, %3 : i64
%64 = llvm.icmp "slt" %42, %63 : i64
%65 = llvm.select %64, %42, %63 : i1, i64
%66 = llvm.bitcast %20 : !llvm.ptr<f32> to !llvm.ptr<f32>
%67 = llvm.mul %9, %3 : i64
%68 = llvm.add %2, %67 : i64
%69 = llvm.mul %49, %6 : i64
%70 = llvm.add %68, %69 : i64
%71 = llvm.icmp "slt" %53, %42 : i64
%72 = llvm.select %71, %53, %42 : i1, i64
%73 = llvm.icmp "slt" %63, %42 : i64
%74 = llvm.select %73, %63, %42 : i1, i64
%75 = llvm.bitcast %27 : !llvm.ptr<f32> to !llvm.ptr<f32>
%76 = llvm.add %58, %69 : i64
llvm.br ^bb5(%2 : i64)
^bb5(%77: i64): // 2 preds: ^bb4, ^bb8
%78 = llvm.icmp "slt" %77, %72 : i64
llvm.cond_br %78, ^bb6(%2 : i64), ^bb9(%2 : i64)
^bb6(%79: i64): // 2 preds: ^bb5, ^bb7
%80 = llvm.icmp "slt" %79, %74 : i64
llvm.cond_br %80, ^bb7, ^bb8
^bb7: // pred: ^bb6
%81 = llvm.add %77, %45 : i64
%82 = llvm.add %79, %49 : i64
%83 = llvm.icmp "eq" %81, %82 : i64
%84 = llvm.select %83, %4, %5 : i1, f32
%85 = llvm.mul %77, %3 : i64
%86 = llvm.add %76, %85 : i64
%87 = llvm.add %86, %79 : i64
%88 = llvm.getelementptr %75[%87] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %84, %88 : !llvm.ptr<f32>
%89 = llvm.add %79, %6 : i64
llvm.br ^bb6(%89 : i64)
^bb8: // pred: ^bb6
%90 = llvm.add %77, %6 : i64
llvm.br ^bb5(%90 : i64)
^bb9(%91: i64): // 2 preds: ^bb5, ^bb26
%92 = llvm.icmp "slt" %91, %55 : i64
llvm.cond_br %92, ^bb10(%2 : i64), ^bb27
^bb10(%93: i64): // 2 preds: ^bb9, ^bb25
%94 = llvm.icmp "slt" %93, %65 : i64
llvm.cond_br %94, ^bb11, ^bb26
^bb11: // pred: ^bb10
%95 = llvm.mul %91, %51 : i64
%96 = llvm.add %55, %95 : i64
%97 = llvm.icmp "slt" %1, %96 : i64
%98 = llvm.select %97, %1, %96 : i1, i64
%99 = llvm.bitcast %56 : !llvm.ptr<f32> to !llvm.ptr<f32>
%100 = llvm.mul %91, %61 : i64
%101 = llvm.add %60, %100 : i64
%102 = llvm.mul %9, %16 : i64
%103 = llvm.add %101, %102 : i64
%104 = llvm.mul %93, %51 : i64
%105 = llvm.add %65, %104 : i64
%106 = llvm.icmp "slt" %1, %105 : i64
%107 = llvm.select %106, %1, %105 : i1, i64
%108 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32>
%109 = llvm.mul %9, %61 : i64
%110 = llvm.add %70, %109 : i64
%111 = llvm.mul %93, %16 : i64
%112 = llvm.add %110, %111 : i64
%113 = llvm.bitcast %75 : !llvm.ptr<f32> to !llvm.ptr<f32>
%114 = llvm.add %76, %100 : i64
%115 = llvm.add %114, %111 : i64
llvm.br ^bb12(%2 : i64)
^bb12(%116: i64): // 2 preds: ^bb11, ^bb24
%117 = llvm.icmp "slt" %116, %98 : i64
llvm.cond_br %117, ^bb13(%2 : i64), ^bb25
^bb13(%118: i64): // 2 preds: ^bb12, ^bb23
%119 = llvm.icmp "slt" %118, %107 : i64
llvm.cond_br %119, ^bb14(%2 : i64), ^bb24
^bb14(%120: i64): // 2 preds: ^bb13, ^bb22
%121 = llvm.icmp "slt" %120, %3 : i64
llvm.cond_br %121, ^bb15, ^bb23
^bb15: // pred: ^bb14
%122 = llvm.mul %116, %51 : i64
%123 = llvm.add %98, %122 : i64
%124 = llvm.icmp "slt" %0, %123 : i64
%125 = llvm.select %124, %0, %123 : i1, i64
%126 = llvm.mul %120, %51 : i64
%127 = llvm.add %126, %3 : i64
%128 = llvm.icmp "slt" %0, %127 : i64
%129 = llvm.select %128, %0, %127 : i1, i64
%130 = llvm.bitcast %99 : !llvm.ptr<f32> to !llvm.ptr<f32>
%131 = llvm.mul %116, %61 : i64
%132 = llvm.add %103, %131 : i64
%133 = llvm.mul %120, %16 : i64
%134 = llvm.add %132, %133 : i64
%135 = llvm.mul %118, %51 : i64
%136 = llvm.add %107, %135 : i64
%137 = llvm.icmp "slt" %0, %136 : i64
%138 = llvm.select %137, %0, %136 : i1, i64
%139 = llvm.bitcast %108 : !llvm.ptr<f32> to !llvm.ptr<f32>
%140 = llvm.mul %120, %61 : i64
%141 = llvm.add %112, %140 : i64
%142 = llvm.mul %118, %16 : i64
%143 = llvm.add %141, %142 : i64
%144 = llvm.bitcast %113 : !llvm.ptr<f32> to !llvm.ptr<f32>
%145 = llvm.add %115, %131 : i64
%146 = llvm.add %145, %142 : i64
llvm.br ^bb16(%2 : i64)
^bb16(%147: i64): // 2 preds: ^bb15, ^bb21
%148 = llvm.icmp "slt" %147, %125 : i64
llvm.cond_br %148, ^bb17(%2 : i64), ^bb22
^bb17(%149: i64): // 2 preds: ^bb16, ^bb20
%150 = llvm.icmp "slt" %149, %138 : i64
llvm.cond_br %150, ^bb18(%2 : i64), ^bb21
^bb18(%151: i64): // 2 preds: ^bb17, ^bb19
%152 = llvm.icmp "slt" %151, %129 : i64
llvm.cond_br %152, ^bb19, ^bb20
^bb19: // pred: ^bb18
%153 = llvm.mul %147, %3 : i64
%154 = llvm.add %134, %153 : i64
%155 = llvm.add %154, %151 : i64
%156 = llvm.getelementptr %130[%155] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%157 = llvm.load %156 : !llvm.ptr<f32>
%158 = llvm.mul %151, %3 : i64
%159 = llvm.add %143, %158 : i64
%160 = llvm.add %159, %149 : i64
%161 = llvm.getelementptr %139[%160] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%162 = llvm.load %161 : !llvm.ptr<f32>
%163 = llvm.add %146, %153 : i64
%164 = llvm.add %163, %149 : i64
%165 = llvm.getelementptr %144[%164] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%166 = llvm.load %165 : !llvm.ptr<f32>
%167 = llvm.fmul %157, %162 : f32
%168 = llvm.fadd %166, %167 : f32
llvm.store %168, %165 : !llvm.ptr<f32>
%169 = llvm.add %151, %6 : i64
llvm.br ^bb18(%169 : i64)
^bb20: // pred: ^bb18
%170 = llvm.add %149, %6 : i64
llvm.br ^bb17(%170 : i64)
^bb21: // pred: ^bb17
%171 = llvm.add %147, %6 : i64
llvm.br ^bb16(%171 : i64)
^bb22: // pred: ^bb16
%172 = llvm.add %120, %0 : i64
llvm.br ^bb14(%172 : i64)
^bb23: // pred: ^bb14
%173 = llvm.add %118, %0 : i64
llvm.br ^bb13(%173 : i64)
^bb24: // pred: ^bb13
%174 = llvm.add %116, %0 : i64
llvm.br ^bb12(%174 : i64)
^bb25: // pred: ^bb12
%175 = llvm.add %93, %1 : i64
llvm.br ^bb10(%175 : i64)
^bb26: // pred: ^bb10
%176 = llvm.add %91, %1 : i64
llvm.br ^bb9(%176 : i64)
^bb27: // pred: ^bb9
%177 = llvm.add %49, %48 : i64
llvm.br ^bb3(%177 : i64)
^bb28: // pred: ^bb3
%178 = llvm.add %45, %44 : i64
llvm.br ^bb1(%178 : i64)
^bb29: // pred: ^bb1
%179 = llvm.mlir.constant(0 : i32) : i32
llvm.return %179 : i32
}
}
}
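// [annotation] matmul_test_dispatch_2 (presumably @actual's linalg.matmul
// path) is a fused dispatch that did translate: ^bb5..^bb8 first materialize
// the accumulator tile with the same select(i == j, 0.0, 1.0) pattern through
// binding 2, then ^bb9..^bb28 run the matmul over bindings 0 and 1,
// accumulating in place into binding 2. Inside the 64-wide workgroup tile the
// loops are further tiled by 32 (%1) and 4 (%0) before the scalar ^bb16..^bb19
// inner loops, and every buffer is addressed row-major as base + i * 10 + j.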
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass //----- //
hal.executable private @matmul_test_dispatch_2 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> {
hal.executable.entry_point public @matmul_test_dispatch_2 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg0]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
hal.return %0, %1, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func internal @matmul_test_dispatch_2(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>) -> i32 attributes {sym_visibility = "private"} {
%0 = llvm.mlir.constant(4 : index) : i64
%1 = llvm.mlir.constant(32 : index) : i64
%2 = llvm.mlir.constant(0 : index) : i64
%3 = llvm.mlir.constant(10 : index) : i64
%4 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%5 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%6 = llvm.mlir.constant(1 : index) : i64
%7 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%8 = llvm.extractvalue %7[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%9 = llvm.mlir.constant(0 : i64) : i64
%10 = llvm.getelementptr %8[%9] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%11 = llvm.load %10 : !llvm.ptr<ptr<i8>>
%12 = llvm.getelementptr %11[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%13 = llvm.bitcast %12 : !llvm.ptr<i8> to !llvm.ptr<f32>
%14 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%15 = llvm.extractvalue %14[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%16 = llvm.mlir.constant(1 : i64) : i64
%17 = llvm.getelementptr %15[%16] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%18 = llvm.load %17 : !llvm.ptr<ptr<i8>>
%19 = llvm.getelementptr %18[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%20 = llvm.bitcast %19 : !llvm.ptr<i8> to !llvm.ptr<f32>
%21 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%22 = llvm.extractvalue %21[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%23 = llvm.mlir.constant(2 : i64) : i64
%24 = llvm.getelementptr %22[%23] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%25 = llvm.load %24 : !llvm.ptr<ptr<i8>>
%26 = llvm.getelementptr %25[%2] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%27 = llvm.bitcast %26 : !llvm.ptr<i8> to !llvm.ptr<f32>
%28 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%29 = llvm.extractvalue %28[0] : !llvm.array<3 x i32>
%30 = llvm.zext %29 : i32 to i64
%31 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%32 = llvm.extractvalue %31[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%33 = llvm.extractvalue %32[0] : !llvm.array<3 x i32>
%34 = llvm.zext %33 : i32 to i64
%35 = llvm.load %arg1 : !llvm.ptr<array<3 x i32>>
%36 = llvm.extractvalue %35[1] : !llvm.array<3 x i32>
%37 = llvm.zext %36 : i32 to i64
%38 = llvm.load %arg0 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>
%39 = llvm.extractvalue %38[0] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%40 = llvm.extractvalue %39[1] : !llvm.array<3 x i32>
%41 = llvm.zext %40 : i32 to i64
%42 = llvm.mlir.constant(64 : index) : i64
%43 = llvm.mul %37, %42 : i64
%44 = llvm.mul %41, %42 : i64
llvm.br ^bb1(%43 : i64)
^bb1(%45: i64): // 2 preds: ^bb0, ^bb28
%46 = llvm.icmp "slt" %45, %3 : i64
llvm.cond_br %46, ^bb2, ^bb29
^bb2: // pred: ^bb1
%47 = llvm.mul %30, %42 : i64
%48 = llvm.mul %34, %42 : i64
llvm.br ^bb3(%47 : i64)
^bb3(%49: i64): // 2 preds: ^bb2, ^bb27
%50 = llvm.icmp "slt" %49, %3 : i64
llvm.cond_br %50, ^bb4, ^bb28
^bb4: // pred: ^bb3
%51 = llvm.mlir.constant(-1 : index) : i64
%52 = llvm.mul %45, %51 : i64
%53 = llvm.add %52, %3 : i64
%54 = llvm.icmp "slt" %42, %53 : i64
%55 = llvm.select %54, %42, %53 : i1, i64
%56 = llvm.bitcast %13 : !llvm.ptr<f32> to !llvm.ptr<f32>
%57 = llvm.mul %45, %3 : i64
%58 = llvm.add %2, %57 : i64
%59 = llvm.mul %9, %6 : i64
%60 = llvm.add %58, %59 : i64
%61 = llvm.mlir.constant(10 : i64) : i64
%62 = llvm.mul %49, %51 : i64
%63 = llvm.add %62, %3 : i64
%64 = llvm.icmp "slt" %42, %63 : i64
%65 = llvm.select %64, %42, %63 : i1, i64
%66 = llvm.bitcast %20 : !llvm.ptr<f32> to !llvm.ptr<f32>
%67 = llvm.mul %9, %3 : i64
%68 = llvm.add %2, %67 : i64
%69 = llvm.mul %49, %6 : i64
%70 = llvm.add %68, %69 : i64
%71 = llvm.icmp "slt" %53, %42 : i64
%72 = llvm.select %71, %53, %42 : i1, i64
%73 = llvm.icmp "slt" %63, %42 : i64
%74 = llvm.select %73, %63, %42 : i1, i64
%75 = llvm.bitcast %27 : !llvm.ptr<f32> to !llvm.ptr<f32>
%76 = llvm.add %58, %69 : i64
llvm.br ^bb5(%2 : i64)
^bb5(%77: i64): // 2 preds: ^bb4, ^bb8
%78 = llvm.icmp "slt" %77, %72 : i64
llvm.cond_br %78, ^bb6(%2 : i64), ^bb9(%2 : i64)
^bb6(%79: i64): // 2 preds: ^bb5, ^bb7
%80 = llvm.icmp "slt" %79, %74 : i64
llvm.cond_br %80, ^bb7, ^bb8
^bb7: // pred: ^bb6
%81 = llvm.add %77, %45 : i64
%82 = llvm.add %79, %49 : i64
%83 = llvm.icmp "eq" %81, %82 : i64
%84 = llvm.select %83, %4, %5 : i1, f32
%85 = llvm.mul %77, %3 : i64
%86 = llvm.add %76, %85 : i64
%87 = llvm.add %86, %79 : i64
%88 = llvm.getelementptr %75[%87] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
llvm.store %84, %88 : !llvm.ptr<f32>
%89 = llvm.add %79, %6 : i64
llvm.br ^bb6(%89 : i64)
^bb8: // pred: ^bb6
%90 = llvm.add %77, %6 : i64
llvm.br ^bb5(%90 : i64)
^bb9(%91: i64): // 2 preds: ^bb5, ^bb26
%92 = llvm.icmp "slt" %91, %55 : i64
llvm.cond_br %92, ^bb10(%2 : i64), ^bb27
^bb10(%93: i64): // 2 preds: ^bb9, ^bb25
%94 = llvm.icmp "slt" %93, %65 : i64
llvm.cond_br %94, ^bb11, ^bb26
^bb11: // pred: ^bb10
%95 = llvm.mul %91, %51 : i64
%96 = llvm.add %55, %95 : i64
%97 = llvm.icmp "slt" %1, %96 : i64
%98 = llvm.select %97, %1, %96 : i1, i64
%99 = llvm.bitcast %56 : !llvm.ptr<f32> to !llvm.ptr<f32>
%100 = llvm.mul %91, %61 : i64
%101 = llvm.add %60, %100 : i64
%102 = llvm.mul %9, %16 : i64
%103 = llvm.add %101, %102 : i64
%104 = llvm.mul %93, %51 : i64
%105 = llvm.add %65, %104 : i64
%106 = llvm.icmp "slt" %1, %105 : i64
%107 = llvm.select %106, %1, %105 : i1, i64
%108 = llvm.bitcast %66 : !llvm.ptr<f32> to !llvm.ptr<f32>
%109 = llvm.mul %9, %61 : i64
%110 = llvm.add %70, %109 : i64
%111 = llvm.mul %93, %16 : i64
%112 = llvm.add %110, %111 : i64
%113 = llvm.bitcast %75 : !llvm.ptr<f32> to !llvm.ptr<f32>
%114 = llvm.add %76, %100 : i64
%115 = llvm.add %114, %111 : i64
llvm.br ^bb12(%2 : i64)
^bb12(%116: i64): // 2 preds: ^bb11, ^bb24
%117 = llvm.icmp "slt" %116, %98 : i64
llvm.cond_br %117, ^bb13(%2 : i64), ^bb25
^bb13(%118: i64): // 2 preds: ^bb12, ^bb23
%119 = llvm.icmp "slt" %118, %107 : i64
llvm.cond_br %119, ^bb14(%2 : i64), ^bb24
^bb14(%120: i64): // 2 preds: ^bb13, ^bb22
%121 = llvm.icmp "slt" %120, %3 : i64
llvm.cond_br %121, ^bb15, ^bb23
^bb15: // pred: ^bb14
%122 = llvm.mul %116, %51 : i64
%123 = llvm.add %98, %122 : i64
%124 = llvm.icmp "slt" %0, %123 : i64
%125 = llvm.select %124, %0, %123 : i1, i64
%126 = llvm.mul %120, %51 : i64
%127 = llvm.add %126, %3 : i64
%128 = llvm.icmp "slt" %0, %127 : i64
%129 = llvm.select %128, %0, %127 : i1, i64
%130 = llvm.bitcast %99 : !llvm.ptr<f32> to !llvm.ptr<f32>
%131 = llvm.mul %116, %61 : i64
%132 = llvm.add %103, %131 : i64
%133 = llvm.mul %120, %16 : i64
%134 = llvm.add %132, %133 : i64
%135 = llvm.mul %118, %51 : i64
%136 = llvm.add %107, %135 : i64
%137 = llvm.icmp "slt" %0, %136 : i64
%138 = llvm.select %137, %0, %136 : i1, i64
%139 = llvm.bitcast %108 : !llvm.ptr<f32> to !llvm.ptr<f32>
%140 = llvm.mul %120, %61 : i64
%141 = llvm.add %112, %140 : i64
%142 = llvm.mul %118, %16 : i64
%143 = llvm.add %141, %142 : i64
%144 = llvm.bitcast %113 : !llvm.ptr<f32> to !llvm.ptr<f32>
%145 = llvm.add %115, %131 : i64
%146 = llvm.add %145, %142 : i64
llvm.br ^bb16(%2 : i64)
^bb16(%147: i64): // 2 preds: ^bb15, ^bb21
%148 = llvm.icmp "slt" %147, %125 : i64
llvm.cond_br %148, ^bb17(%2 : i64), ^bb22
^bb17(%149: i64): // 2 preds: ^bb16, ^bb20
%150 = llvm.icmp "slt" %149, %138 : i64
llvm.cond_br %150, ^bb18(%2 : i64), ^bb21
^bb18(%151: i64): // 2 preds: ^bb17, ^bb19
%152 = llvm.icmp "slt" %151, %129 : i64
llvm.cond_br %152, ^bb19, ^bb20
^bb19: // pred: ^bb18
%153 = llvm.mul %147, %3 : i64
%154 = llvm.add %134, %153 : i64
%155 = llvm.add %154, %151 : i64
%156 = llvm.getelementptr %130[%155] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%157 = llvm.load %156 : !llvm.ptr<f32>
%158 = llvm.mul %151, %3 : i64
%159 = llvm.add %143, %158 : i64
%160 = llvm.add %159, %149 : i64
%161 = llvm.getelementptr %139[%160] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%162 = llvm.load %161 : !llvm.ptr<f32>
%163 = llvm.add %146, %153 : i64
%164 = llvm.add %163, %149 : i64
%165 = llvm.getelementptr %144[%164] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%166 = llvm.load %165 : !llvm.ptr<f32>
%167 = llvm.fmul %157, %162 : f32
%168 = llvm.fadd %166, %167 : f32
llvm.store %168, %165 : !llvm.ptr<f32>
%169 = llvm.add %151, %6 : i64
llvm.br ^bb18(%169 : i64)
^bb20: // pred: ^bb18
%170 = llvm.add %149, %6 : i64
llvm.br ^bb17(%170 : i64)
^bb21: // pred: ^bb17
%171 = llvm.add %147, %6 : i64
llvm.br ^bb16(%171 : i64)
^bb22: // pred: ^bb16
%172 = llvm.add %120, %0 : i64
llvm.br ^bb14(%172 : i64)
^bb23: // pred: ^bb14
%173 = llvm.add %118, %0 : i64
llvm.br ^bb13(%173 : i64)
^bb24: // pred: ^bb13
%174 = llvm.add %116, %0 : i64
llvm.br ^bb12(%174 : i64)
^bb25: // pred: ^bb12
%175 = llvm.add %93, %1 : i64
llvm.br ^bb10(%175 : i64)
^bb26: // pred: ^bb10
%176 = llvm.add %91, %1 : i64
llvm.br ^bb9(%176 : i64)
^bb27: // pred: ^bb9
%177 = llvm.add %49, %48 : i64
llvm.br ^bb3(%177 : i64)
^bb28: // pred: ^bb3
%178 = llvm.add %45, %44 : i64
llvm.br ^bb1(%178 : i64)
^bb29: // pred: ^bb1
%179 = llvm.mlir.constant(0 : i32) : i32
llvm.return %179 : i32
}
}
}
}
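// [annotation] This TranslateExecutablesPass dump is the same variant as
// above, re-printed inside its parent hal.executable together with its io
// interface: two read-only bindings for the matmul inputs plus the
// Write|Discard result binding that the fill-then-accumulate loops target.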
<stdin>:11:10: error: unhandled multiple roots in dispatch region
%0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
^
<stdin>:60:10: note: called from
%7 = call @expected(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
^
<stdin>:11:10: note: see current operation: %30 = "linalg.generic"(%18, %20, %29) ( {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32
%32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32
"linalg.yield"(%32) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
^
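// [annotation] Here the log switches from IR dumps to the actual failure. The
// source op is @expected's linalg.generic, whose iterator_types put the
// reduction in the middle position ("parallel", "reduction", "parallel")
// rather than last. After dispatch-region formation, the region for
// matmul_test_dispatch_3 carries two linalg.generic ops that are both tagged
// __internal_linalg_transform__ = "workgroup" with their own lowering.config
// (the diagonal-fill producer and the matmul-form generic shown in the dump
// below), and the CPUVectorization pipeline appears unable to pick a single
// tiled root among them, hence "unhandled multiple roots in dispatch region".
//
// [annotation] For contrast, a minimal sketch of a single-root generic in the
// canonical matmul form (reduction iterator last), the shape that dispatch_2
// lowered successfully; the function and value names here are illustrative
// only, not taken from this dump:
#mapA = affine_map<(d0, d1, d2) -> (d0, d2)>
#mapB = affine_map<(d0, d1, d2) -> (d2, d1)>
#mapC = affine_map<(d0, d1, d2) -> (d0, d1)>
func private @matmul_like(%a: tensor<?x?xf32>, %b: tensor<?x?xf32>,
                          %init: tensor<?x?xf32>) -> tensor<?x?xf32> {
  // Single root: one generic, reduction over d2, accumulating into %init.
  %0 = linalg.generic {indexing_maps = [#mapA, #mapB, #mapC],
                       iterator_types = ["parallel", "parallel", "reduction"]}
       ins(%a, %b : tensor<?x?xf32>, tensor<?x?xf32>)
       outs(%init : tensor<?x?xf32>) {
  ^bb0(%x: f32, %y: f32, %acc: f32):
    %1 = mulf %x, %y : f32
    %2 = addf %1, %acc : f32
    linalg.yield %2 : f32
  } -> tensor<?x?xf32>
  return %0 : tensor<?x?xf32>
}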
<stdin>:11:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
%0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
^
<stdin>:60:10: note: called from
%7 = call @expected(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
^
<stdin>:11:10: note: see current operation: "hal.executable.variant"() ( {
"hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> ()
"builtin.module"() ( {
"builtin.func"() ( {
%0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32
%1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32
%2 = "std.constant"() {value = 10 : index} : () -> index
%3 = "std.constant"() {value = 0 : index} : () -> index
%4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32>
%7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index
%8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index
%9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%13, %2, %14) ( {
^bb0(%arg0: index): // no predecessors
%15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%15, %2, %16) ( {
^bb0(%arg1: index): // no predecessors
%17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32>
%19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32>
%21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%29 = "linalg.generic"(%25, %28) ( {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%31 = "linalg.index"() {dim = 0 : i64} : () -> index
%32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%33 = "linalg.index"() {dim = 1 : i64} : () -> index
%34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1
%36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32
"linalg.yield"(%36) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%30 = "linalg.generic"(%18, %20, %29) ( {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32
%32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32
"linalg.yield"(%32) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
"flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"std.return"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> ()
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io", sym_visibility = "private"} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> ()
%0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
^
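// [annotation] The translation error above is the same failure surfacing one
// level up, from TranslateTargetExecutableVariantsPass on the
// system-elf-x86_64 variant. The generic-form dump shows the offending
// dispatch function: %29 is the diagonal-fill generic and %30 the matmul-form
// generic, both tagged __internal_linalg_transform__ = "workgroup" with
// lowering.config tileSizes [[64, 64]], i.e. the two roots the previous error
// complained about. The -9223372036854775808 values in static_offsets and the
// -1 values in static_sizes on the flow.dispatch.tensor.load/store ops are
// MLIR's i64 sentinels for "dynamic", not corruption.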
<stdin>:11:10: error: failed to serialize executables
%0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
^
<stdin>:60:10: note: called from
%7 = call @expected(%1, %3, %5) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
^
<stdin>:11:10: note: see current operation: "hal.executable"() ( {
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io"} : () -> ()
"hal.executable.variant"() ( {
"hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> ()
"builtin.module"() ( {
"builtin.func"() ( {
%0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32
%1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32
%2 = "std.constant"() {value = 10 : index} : () -> index
%3 = "std.constant"() {value = 0 : index} : () -> index
%4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32>
%7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index
%8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index
%9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%13, %2, %14) ( {
^bb0(%arg0: index): // no predecessors
%15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%15, %2, %16) ( {
^bb0(%arg1: index): // no predecessors
%17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32>
%19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32>
%21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%29 = "linalg.generic"(%25, %28) ( {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%31 = "linalg.index"() {dim = 0 : i64} : () -> index
%32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%33 = "linalg.index"() {dim = 1 : i64} : () -> index
%34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1
%36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32
"linalg.yield"(%36) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%30 = "linalg.generic"(%18, %20, %29) ( {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32
%32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32
"linalg.yield"(%32) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
"flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"std.return"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> ()
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io", sym_visibility = "private"} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> ()
"hal.executable_end"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", sym_visibility = "private"} : () -> ()
%0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) {
^
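// [annotation] From here on the errors are cascades of the same failure:
// executable serialization fails because the system-elf-x86_64 variant of
// matmul_test_dispatch_3 was never translated, and the HAL-to-VM conversion
// error below reports the same thing at module scope. Its "see current
// operation" dump shows the module state at the point of failure, with
// matmul_test_dispatch_0 and matmul_test_dispatch_2 already lowered to
// llvm.func bodies.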
<stdin>:5:1: error: conversion from source -> vm failed
module {
^
<stdin>:5:1: note: see current operation: "builtin.module"() ( {
"hal.executable"() ( {
"hal.interface"() ( {
"hal.interface.binding"() {access = 6 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io"} : () -> ()
"hal.executable.variant"() ( {
"hal.executable.entry_point"() ( {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%0 = "std.constant"() {value = 1 : index} : () -> index
%1 = "affine.apply"(%arg0) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index
%2 = "affine.apply"(%arg1) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index
"hal.return"(%1, %2, %0) : (index, index, index) -> ()
}) {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_0", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> ()
"builtin.module"() ( {
"llvm.func"() ( {
^bb0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>): // no predecessors
%0 = "llvm.mlir.constant"() {value = 0 : index} : () -> i64
%1 = "llvm.mlir.constant"() {value = 10 : index} : () -> i64
%2 = "llvm.mlir.constant"() {value = 0.000000e+00 : f32} : () -> f32
%3 = "llvm.mlir.constant"() {value = 1.000000e+00 : f32} : () -> f32
%4 = "llvm.mlir.constant"() {value = 1 : index} : () -> i64
%5 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%6 = "llvm.extractvalue"(%5) {position = [5]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.ptr<ptr<i8>>
%7 = "llvm.mlir.constant"() {value = 0 : i64} : () -> i64
%8 = "llvm.getelementptr"(%6, %7) : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%9 = "llvm.load"(%8) : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<i8>
%10 = "llvm.getelementptr"(%9, %0) : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%11 = "llvm.bitcast"(%10) : (!llvm.ptr<i8>) -> !llvm.ptr<f32>
%12 = "llvm.load"(%arg1) : (!llvm.ptr<array<3 x i32>>) -> !llvm.array<3 x i32>
%13 = "llvm.extractvalue"(%12) {position = [0]} : (!llvm.array<3 x i32>) -> i32
%14 = "llvm.zext"(%13) : (i32) -> i64
%15 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%16 = "llvm.extractvalue"(%15) {position = [0]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.array<3 x i32>
%17 = "llvm.extractvalue"(%16) {position = [0]} : (!llvm.array<3 x i32>) -> i32
%18 = "llvm.zext"(%17) : (i32) -> i64
%19 = "llvm.load"(%arg1) : (!llvm.ptr<array<3 x i32>>) -> !llvm.array<3 x i32>
%20 = "llvm.extractvalue"(%19) {position = [1]} : (!llvm.array<3 x i32>) -> i32
%21 = "llvm.zext"(%20) : (i32) -> i64
%22 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%23 = "llvm.extractvalue"(%22) {position = [0]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.array<3 x i32>
%24 = "llvm.extractvalue"(%23) {position = [1]} : (!llvm.array<3 x i32>) -> i32
%25 = "llvm.zext"(%24) : (i32) -> i64
%26 = "llvm.mlir.constant"() {value = 64 : index} : () -> i64
%27 = "llvm.mul"(%21, %26) : (i64, i64) -> i64
%28 = "llvm.mul"(%25, %26) : (i64, i64) -> i64
"llvm.br"(%27)[^bb1] : (i64) -> ()
^bb1(%29: i64): // 2 preds: ^bb0, ^bb10
%30 = "llvm.icmp"(%29, %1) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%30)[^bb2, ^bb11] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb2: // pred: ^bb1
%31 = "llvm.mul"(%14, %26) : (i64, i64) -> i64
%32 = "llvm.mul"(%18, %26) : (i64, i64) -> i64
"llvm.br"(%31)[^bb3] : (i64) -> ()
^bb3(%33: i64): // 2 preds: ^bb2, ^bb9
%34 = "llvm.icmp"(%33, %1) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%34)[^bb4, ^bb10] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb4: // pred: ^bb3
%35 = "llvm.mlir.constant"() {value = -1 : index} : () -> i64
%36 = "llvm.mul"(%29, %35) : (i64, i64) -> i64
%37 = "llvm.add"(%36, %1) : (i64, i64) -> i64
%38 = "llvm.icmp"(%26, %37) {predicate = 2 : i64} : (i64, i64) -> i1
%39 = "llvm.select"(%38, %26, %37) : (i1, i64, i64) -> i64
%40 = "llvm.mul"(%33, %35) : (i64, i64) -> i64
%41 = "llvm.add"(%40, %1) : (i64, i64) -> i64
%42 = "llvm.icmp"(%26, %41) {predicate = 2 : i64} : (i64, i64) -> i1
%43 = "llvm.select"(%42, %26, %41) : (i1, i64, i64) -> i64
%44 = "llvm.bitcast"(%11) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%45 = "llvm.mul"(%29, %1) : (i64, i64) -> i64
%46 = "llvm.add"(%0, %45) : (i64, i64) -> i64
%47 = "llvm.mul"(%33, %4) : (i64, i64) -> i64
%48 = "llvm.add"(%46, %47) : (i64, i64) -> i64
"llvm.br"(%0)[^bb5] : (i64) -> ()
^bb5(%49: i64): // 2 preds: ^bb4, ^bb8
%50 = "llvm.icmp"(%49, %39) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%50, %0)[^bb6, ^bb9] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> ()
^bb6(%51: i64): // 2 preds: ^bb5, ^bb7
%52 = "llvm.icmp"(%51, %43) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%52)[^bb7, ^bb8] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb7: // pred: ^bb6
%53 = "llvm.add"(%49, %29) : (i64, i64) -> i64
%54 = "llvm.add"(%51, %33) : (i64, i64) -> i64
%55 = "llvm.icmp"(%53, %54) {predicate = 0 : i64} : (i64, i64) -> i1
%56 = "llvm.select"(%55, %2, %3) : (i1, f32, f32) -> f32
%57 = "llvm.mul"(%49, %1) : (i64, i64) -> i64
%58 = "llvm.add"(%48, %57) : (i64, i64) -> i64
%59 = "llvm.add"(%58, %51) : (i64, i64) -> i64
%60 = "llvm.getelementptr"(%44, %59) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
"llvm.store"(%56, %60) : (f32, !llvm.ptr<f32>) -> ()
%61 = "llvm.add"(%51, %4) : (i64, i64) -> i64
"llvm.br"(%61)[^bb6] : (i64) -> ()
^bb8: // pred: ^bb6
%62 = "llvm.add"(%49, %4) : (i64, i64) -> i64
"llvm.br"(%62)[^bb5] : (i64) -> ()
^bb9: // pred: ^bb5
%63 = "llvm.add"(%33, %32) : (i64, i64) -> i64
"llvm.br"(%63)[^bb3] : (i64) -> ()
^bb10: // pred: ^bb3
%64 = "llvm.add"(%29, %28) : (i64, i64) -> i64
"llvm.br"(%64)[^bb1] : (i64) -> ()
^bb11: // pred: ^bb1
%65 = "llvm.mlir.constant"() {value = 0 : i32} : () -> i32
"llvm.return"(%65) : (i32) -> ()
}) {linkage = #llvm.linkage<internal>, sym_name = "matmul_test_dispatch_0", sym_visibility = "private", type = !llvm.func<i32 (ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, ptr<array<3 x i32>>, ptr<i8>)>} : () -> ()
}) {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> ()
"hal.executable_end"() : () -> ()
}) {sym_name = "matmul_test_dispatch_0", sym_visibility = "private"} : () -> ()
"hal.executable"() ( {
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io"} : () -> ()
"hal.executable.variant"() ( {
"hal.executable.entry_point"() ( {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%0 = "std.constant"() {value = 1 : index} : () -> index
%1 = "affine.apply"(%arg0) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index
%2 = "affine.apply"(%arg1) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index
"hal.return"(%1, %2, %0) : (index, index, index) -> ()
}) {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_2", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> ()
"builtin.module"() ( {
"llvm.func"() ( {
^bb0(%arg0: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, %arg1: !llvm.ptr<array<3 x i32>>, %arg2: !llvm.ptr<i8>): // no predecessors
%0 = "llvm.mlir.constant"() {value = 4 : index} : () -> i64
%1 = "llvm.mlir.constant"() {value = 32 : index} : () -> i64
%2 = "llvm.mlir.constant"() {value = 0 : index} : () -> i64
%3 = "llvm.mlir.constant"() {value = 10 : index} : () -> i64
%4 = "llvm.mlir.constant"() {value = 0.000000e+00 : f32} : () -> f32
%5 = "llvm.mlir.constant"() {value = 1.000000e+00 : f32} : () -> f32
%6 = "llvm.mlir.constant"() {value = 1 : index} : () -> i64
%7 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%8 = "llvm.extractvalue"(%7) {position = [5]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.ptr<ptr<i8>>
%9 = "llvm.mlir.constant"() {value = 0 : i64} : () -> i64
%10 = "llvm.getelementptr"(%8, %9) : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%11 = "llvm.load"(%10) : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<i8>
%12 = "llvm.getelementptr"(%11, %2) : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%13 = "llvm.bitcast"(%12) : (!llvm.ptr<i8>) -> !llvm.ptr<f32>
%14 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%15 = "llvm.extractvalue"(%14) {position = [5]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.ptr<ptr<i8>>
%16 = "llvm.mlir.constant"() {value = 1 : i64} : () -> i64
%17 = "llvm.getelementptr"(%15, %16) : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%18 = "llvm.load"(%17) : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<i8>
%19 = "llvm.getelementptr"(%18, %2) : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%20 = "llvm.bitcast"(%19) : (!llvm.ptr<i8>) -> !llvm.ptr<f32>
%21 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%22 = "llvm.extractvalue"(%21) {position = [5]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.ptr<ptr<i8>>
%23 = "llvm.mlir.constant"() {value = 2 : i64} : () -> i64
%24 = "llvm.getelementptr"(%22, %23) : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
%25 = "llvm.load"(%24) : (!llvm.ptr<ptr<i8>>) -> !llvm.ptr<i8>
%26 = "llvm.getelementptr"(%25, %2) : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
%27 = "llvm.bitcast"(%26) : (!llvm.ptr<i8>) -> !llvm.ptr<f32>
%28 = "llvm.load"(%arg1) : (!llvm.ptr<array<3 x i32>>) -> !llvm.array<3 x i32>
%29 = "llvm.extractvalue"(%28) {position = [0]} : (!llvm.array<3 x i32>) -> i32
%30 = "llvm.zext"(%29) : (i32) -> i64
%31 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%32 = "llvm.extractvalue"(%31) {position = [0]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.array<3 x i32>
%33 = "llvm.extractvalue"(%32) {position = [0]} : (!llvm.array<3 x i32>) -> i32
%34 = "llvm.zext"(%33) : (i32) -> i64
%35 = "llvm.load"(%arg1) : (!llvm.ptr<array<3 x i32>>) -> !llvm.array<3 x i32>
%36 = "llvm.extractvalue"(%35) {position = [1]} : (!llvm.array<3 x i32>) -> i32
%37 = "llvm.zext"(%36) : (i32) -> i64
%38 = "llvm.load"(%arg0) : (!llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>) -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>
%39 = "llvm.extractvalue"(%38) {position = [0]} : (!llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>) -> !llvm.array<3 x i32>
%40 = "llvm.extractvalue"(%39) {position = [1]} : (!llvm.array<3 x i32>) -> i32
%41 = "llvm.zext"(%40) : (i32) -> i64
%42 = "llvm.mlir.constant"() {value = 64 : index} : () -> i64
%43 = "llvm.mul"(%37, %42) : (i64, i64) -> i64
%44 = "llvm.mul"(%41, %42) : (i64, i64) -> i64
"llvm.br"(%43)[^bb1] : (i64) -> ()
^bb1(%45: i64): // 2 preds: ^bb0, ^bb28
%46 = "llvm.icmp"(%45, %3) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%46)[^bb2, ^bb29] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb2: // pred: ^bb1
%47 = "llvm.mul"(%30, %42) : (i64, i64) -> i64
%48 = "llvm.mul"(%34, %42) : (i64, i64) -> i64
"llvm.br"(%47)[^bb3] : (i64) -> ()
^bb3(%49: i64): // 2 preds: ^bb2, ^bb27
%50 = "llvm.icmp"(%49, %3) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%50)[^bb4, ^bb28] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb4: // pred: ^bb3
%51 = "llvm.mlir.constant"() {value = -1 : index} : () -> i64
%52 = "llvm.mul"(%45, %51) : (i64, i64) -> i64
%53 = "llvm.add"(%52, %3) : (i64, i64) -> i64
%54 = "llvm.icmp"(%42, %53) {predicate = 2 : i64} : (i64, i64) -> i1
%55 = "llvm.select"(%54, %42, %53) : (i1, i64, i64) -> i64
%56 = "llvm.bitcast"(%13) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%57 = "llvm.mul"(%45, %3) : (i64, i64) -> i64
%58 = "llvm.add"(%2, %57) : (i64, i64) -> i64
%59 = "llvm.mul"(%9, %6) : (i64, i64) -> i64
%60 = "llvm.add"(%58, %59) : (i64, i64) -> i64
%61 = "llvm.mlir.constant"() {value = 10 : i64} : () -> i64
%62 = "llvm.mul"(%49, %51) : (i64, i64) -> i64
%63 = "llvm.add"(%62, %3) : (i64, i64) -> i64
%64 = "llvm.icmp"(%42, %63) {predicate = 2 : i64} : (i64, i64) -> i1
%65 = "llvm.select"(%64, %42, %63) : (i1, i64, i64) -> i64
%66 = "llvm.bitcast"(%20) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%67 = "llvm.mul"(%9, %3) : (i64, i64) -> i64
%68 = "llvm.add"(%2, %67) : (i64, i64) -> i64
%69 = "llvm.mul"(%49, %6) : (i64, i64) -> i64
%70 = "llvm.add"(%68, %69) : (i64, i64) -> i64
%71 = "llvm.icmp"(%53, %42) {predicate = 2 : i64} : (i64, i64) -> i1
%72 = "llvm.select"(%71, %53, %42) : (i1, i64, i64) -> i64
%73 = "llvm.icmp"(%63, %42) {predicate = 2 : i64} : (i64, i64) -> i1
%74 = "llvm.select"(%73, %63, %42) : (i1, i64, i64) -> i64
%75 = "llvm.bitcast"(%27) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%76 = "llvm.add"(%58, %69) : (i64, i64) -> i64
"llvm.br"(%2)[^bb5] : (i64) -> ()
^bb5(%77: i64): // 2 preds: ^bb4, ^bb8
%78 = "llvm.icmp"(%77, %72) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%78, %2, %2)[^bb6, ^bb9] {operand_segment_sizes = dense<1> : vector<3xi32>} : (i1, i64, i64) -> ()
^bb6(%79: i64): // 2 preds: ^bb5, ^bb7
%80 = "llvm.icmp"(%79, %74) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%80)[^bb7, ^bb8] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb7: // pred: ^bb6
%81 = "llvm.add"(%77, %45) : (i64, i64) -> i64
%82 = "llvm.add"(%79, %49) : (i64, i64) -> i64
%83 = "llvm.icmp"(%81, %82) {predicate = 0 : i64} : (i64, i64) -> i1
%84 = "llvm.select"(%83, %4, %5) : (i1, f32, f32) -> f32
%85 = "llvm.mul"(%77, %3) : (i64, i64) -> i64
%86 = "llvm.add"(%76, %85) : (i64, i64) -> i64
%87 = "llvm.add"(%86, %79) : (i64, i64) -> i64
%88 = "llvm.getelementptr"(%75, %87) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
"llvm.store"(%84, %88) : (f32, !llvm.ptr<f32>) -> ()
%89 = "llvm.add"(%79, %6) : (i64, i64) -> i64
"llvm.br"(%89)[^bb6] : (i64) -> ()
^bb8: // pred: ^bb6
%90 = "llvm.add"(%77, %6) : (i64, i64) -> i64
"llvm.br"(%90)[^bb5] : (i64) -> ()
^bb9(%91: i64): // 2 preds: ^bb5, ^bb26
%92 = "llvm.icmp"(%91, %55) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%92, %2)[^bb10, ^bb27] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> ()
^bb10(%93: i64): // 2 preds: ^bb9, ^bb25
%94 = "llvm.icmp"(%93, %65) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%94)[^bb11, ^bb26] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb11: // pred: ^bb10
%95 = "llvm.mul"(%91, %51) : (i64, i64) -> i64
%96 = "llvm.add"(%55, %95) : (i64, i64) -> i64
%97 = "llvm.icmp"(%1, %96) {predicate = 2 : i64} : (i64, i64) -> i1
%98 = "llvm.select"(%97, %1, %96) : (i1, i64, i64) -> i64
%99 = "llvm.bitcast"(%56) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%100 = "llvm.mul"(%91, %61) : (i64, i64) -> i64
%101 = "llvm.add"(%60, %100) : (i64, i64) -> i64
%102 = "llvm.mul"(%9, %16) : (i64, i64) -> i64
%103 = "llvm.add"(%101, %102) : (i64, i64) -> i64
%104 = "llvm.mul"(%93, %51) : (i64, i64) -> i64
%105 = "llvm.add"(%65, %104) : (i64, i64) -> i64
%106 = "llvm.icmp"(%1, %105) {predicate = 2 : i64} : (i64, i64) -> i1
%107 = "llvm.select"(%106, %1, %105) : (i1, i64, i64) -> i64
%108 = "llvm.bitcast"(%66) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%109 = "llvm.mul"(%9, %61) : (i64, i64) -> i64
%110 = "llvm.add"(%70, %109) : (i64, i64) -> i64
%111 = "llvm.mul"(%93, %16) : (i64, i64) -> i64
%112 = "llvm.add"(%110, %111) : (i64, i64) -> i64
%113 = "llvm.bitcast"(%75) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%114 = "llvm.add"(%76, %100) : (i64, i64) -> i64
%115 = "llvm.add"(%114, %111) : (i64, i64) -> i64
"llvm.br"(%2)[^bb12] : (i64) -> ()
^bb12(%116: i64): // 2 preds: ^bb11, ^bb24
%117 = "llvm.icmp"(%116, %98) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%117, %2)[^bb13, ^bb25] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> ()
^bb13(%118: i64): // 2 preds: ^bb12, ^bb23
%119 = "llvm.icmp"(%118, %107) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%119, %2)[^bb14, ^bb24] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> ()
^bb14(%120: i64): // 2 preds: ^bb13, ^bb22
%121 = "llvm.icmp"(%120, %3) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%121)[^bb15, ^bb23] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb15: // pred: ^bb14
%122 = "llvm.mul"(%116, %51) : (i64, i64) -> i64
%123 = "llvm.add"(%98, %122) : (i64, i64) -> i64
%124 = "llvm.icmp"(%0, %123) {predicate = 2 : i64} : (i64, i64) -> i1
%125 = "llvm.select"(%124, %0, %123) : (i1, i64, i64) -> i64
%126 = "llvm.mul"(%120, %51) : (i64, i64) -> i64
%127 = "llvm.add"(%126, %3) : (i64, i64) -> i64
%128 = "llvm.icmp"(%0, %127) {predicate = 2 : i64} : (i64, i64) -> i1
%129 = "llvm.select"(%128, %0, %127) : (i1, i64, i64) -> i64
%130 = "llvm.bitcast"(%99) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%131 = "llvm.mul"(%116, %61) : (i64, i64) -> i64
%132 = "llvm.add"(%103, %131) : (i64, i64) -> i64
%133 = "llvm.mul"(%120, %16) : (i64, i64) -> i64
%134 = "llvm.add"(%132, %133) : (i64, i64) -> i64
%135 = "llvm.mul"(%118, %51) : (i64, i64) -> i64
%136 = "llvm.add"(%107, %135) : (i64, i64) -> i64
%137 = "llvm.icmp"(%0, %136) {predicate = 2 : i64} : (i64, i64) -> i1
%138 = "llvm.select"(%137, %0, %136) : (i1, i64, i64) -> i64
%139 = "llvm.bitcast"(%108) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%140 = "llvm.mul"(%120, %61) : (i64, i64) -> i64
%141 = "llvm.add"(%112, %140) : (i64, i64) -> i64
%142 = "llvm.mul"(%118, %16) : (i64, i64) -> i64
%143 = "llvm.add"(%141, %142) : (i64, i64) -> i64
%144 = "llvm.bitcast"(%113) : (!llvm.ptr<f32>) -> !llvm.ptr<f32>
%145 = "llvm.add"(%115, %131) : (i64, i64) -> i64
%146 = "llvm.add"(%145, %142) : (i64, i64) -> i64
"llvm.br"(%2)[^bb16] : (i64) -> ()
^bb16(%147: i64): // 2 preds: ^bb15, ^bb21
%148 = "llvm.icmp"(%147, %125) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%148, %2)[^bb17, ^bb22] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> ()
^bb17(%149: i64): // 2 preds: ^bb16, ^bb20
%150 = "llvm.icmp"(%149, %138) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%150, %2)[^bb18, ^bb21] {operand_segment_sizes = dense<[1, 1, 0]> : vector<3xi32>} : (i1, i64) -> ()
^bb18(%151: i64): // 2 preds: ^bb17, ^bb19
%152 = "llvm.icmp"(%151, %129) {predicate = 2 : i64} : (i64, i64) -> i1
"llvm.cond_br"(%152)[^bb19, ^bb20] {operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (i1) -> ()
^bb19: // pred: ^bb18
%153 = "llvm.mul"(%147, %3) : (i64, i64) -> i64
%154 = "llvm.add"(%134, %153) : (i64, i64) -> i64
%155 = "llvm.add"(%154, %151) : (i64, i64) -> i64
%156 = "llvm.getelementptr"(%130, %155) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%157 = "llvm.load"(%156) : (!llvm.ptr<f32>) -> f32
%158 = "llvm.mul"(%151, %3) : (i64, i64) -> i64
%159 = "llvm.add"(%143, %158) : (i64, i64) -> i64
%160 = "llvm.add"(%159, %149) : (i64, i64) -> i64
%161 = "llvm.getelementptr"(%139, %160) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%162 = "llvm.load"(%161) : (!llvm.ptr<f32>) -> f32
%163 = "llvm.add"(%146, %153) : (i64, i64) -> i64
%164 = "llvm.add"(%163, %149) : (i64, i64) -> i64
%165 = "llvm.getelementptr"(%144, %164) : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
%166 = "llvm.load"(%165) : (!llvm.ptr<f32>) -> f32
%167 = "llvm.fmul"(%157, %162) : (f32, f32) -> f32
%168 = "llvm.fadd"(%166, %167) : (f32, f32) -> f32
"llvm.store"(%168, %165) : (f32, !llvm.ptr<f32>) -> ()
%169 = "llvm.add"(%151, %6) : (i64, i64) -> i64
"llvm.br"(%169)[^bb18] : (i64) -> ()
^bb20: // pred: ^bb18
%170 = "llvm.add"(%149, %6) : (i64, i64) -> i64
"llvm.br"(%170)[^bb17] : (i64) -> ()
^bb21: // pred: ^bb17
%171 = "llvm.add"(%147, %6) : (i64, i64) -> i64
"llvm.br"(%171)[^bb16] : (i64) -> ()
^bb22: // pred: ^bb16
%172 = "llvm.add"(%120, %0) : (i64, i64) -> i64
"llvm.br"(%172)[^bb14] : (i64) -> ()
^bb23: // pred: ^bb14
%173 = "llvm.add"(%118, %0) : (i64, i64) -> i64
"llvm.br"(%173)[^bb13] : (i64) -> ()
^bb24: // pred: ^bb13
%174 = "llvm.add"(%116, %0) : (i64, i64) -> i64
"llvm.br"(%174)[^bb12] : (i64) -> ()
^bb25: // pred: ^bb12
%175 = "llvm.add"(%93, %1) : (i64, i64) -> i64
"llvm.br"(%175)[^bb10] : (i64) -> ()
^bb26: // pred: ^bb10
%176 = "llvm.add"(%91, %1) : (i64, i64) -> i64
"llvm.br"(%176)[^bb9] : (i64) -> ()
^bb27: // pred: ^bb9
%177 = "llvm.add"(%49, %48) : (i64, i64) -> i64
"llvm.br"(%177)[^bb3] : (i64) -> ()
^bb28: // pred: ^bb3
%178 = "llvm.add"(%45, %44) : (i64, i64) -> i64
"llvm.br"(%178)[^bb1] : (i64) -> ()
^bb29: // pred: ^bb1
%179 = "llvm.mlir.constant"() {value = 0 : i32} : () -> i32
"llvm.return"(%179) : (i32) -> ()
}) {linkage = #llvm.linkage<internal>, sym_name = "matmul_test_dispatch_2", sym_visibility = "private", type = !llvm.func<i32 (ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (array<3 x i32>, array<3 x i32>, i64, ptr<i32>, i64, ptr<ptr<i8>>, ptr<i64>)>>, ptr<array<3 x i32>>, ptr<i8>)>} : () -> ()
}) {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> ()
"hal.executable_end"() : () -> ()
}) {sym_name = "matmul_test_dispatch_2", sym_visibility = "private"} : () -> ()
"hal.executable"() ( {
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io"} : () -> ()
"hal.executable.variant"() ( {
"hal.executable.entry_point"() {interface = @io, ordinal = 0 : index, sym_name = "matmul_test_dispatch_3", translation.info = {passPipeline = "CPUVectorization", workloadPerWorkgroup = [64, 64]}} : () -> ()
"builtin.module"() ( {
"builtin.func"() ( {
%0 = "std.constant"() {value = 1.000000e+00 : f32} : () -> f32
%1 = "std.constant"() {value = 0.000000e+00 : f32} : () -> f32
%2 = "std.constant"() {value = 10 : index} : () -> index
%3 = "std.constant"() {value = 0 : index} : () -> index
%4 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b0_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%5 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b1_ro_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<readonly:10x10xf32>
%6 = "hal.interface.binding.subspan"(%3) {binding = @io::@s0b2_xw_external, operand_segment_sizes = dense<[1, 0, 0]> : vector<3xi32>} : (index) -> !flow.dispatch.tensor<writeonly:10x10xf32>
%7 = "hal.interface.workgroup.size"() {dimension = 0 : index} : () -> index
%8 = "hal.interface.workgroup.size"() {dimension = 1 : index} : () -> index
%9 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%10 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%11 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%12 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%13 = "affine.apply"(%11, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%14 = "affine.apply"(%12, %8) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%13, %2, %14) ( {
^bb0(%arg0: index): // no predecessors
%15 = "affine.apply"(%9, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
%16 = "affine.apply"(%10, %7) {map = affine_map<()[s0, s1] -> (s0 * s1)>} : (index, index) -> index
"scf.for"(%15, %2, %16) ( {
^bb0(%arg1: index): // no predecessors
%17 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%18 = "flow.dispatch.tensor.load"(%4, %arg0, %17) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [-9223372036854775808, 0], static_sizes = [-1, 10], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<?x10xf32>
%19 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%20 = "flow.dispatch.tensor.load"(%5, %arg1, %19) {operand_segment_sizes = dense<[1, 1, 1, 0]> : vector<4xi32>, static_offsets = [0, -9223372036854775808], static_sizes = [10, -1], static_strides = [1, 1]} : (!flow.dispatch.tensor<readonly:10x10xf32>, index, index) -> tensor<10x?xf32>
%21 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%22 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (s0, -d0 + 10)>} : (index, index) -> index
%23 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%24 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%25 = "linalg.init_tensor"(%23, %24) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%26 = "affine.min"(%arg0, %8) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%27 = "affine.min"(%arg1, %7) {map = affine_map<(d0)[s0] -> (-d0 + 10, s0)>} : (index, index) -> index
%28 = "linalg.init_tensor"(%26, %27) {static_sizes = [-1, -1]} : (index, index) -> tensor<?x?xf32>
%29 = "linalg.generic"(%25, %28) ( {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
%31 = "linalg.index"() {dim = 0 : i64} : () -> index
%32 = "affine.apply"(%31, %arg0) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%33 = "linalg.index"() {dim = 1 : i64} : () -> index
%34 = "affine.apply"(%33, %arg1) {map = affine_map<(d0, d1) -> (d0 + d1)>} : (index, index) -> index
%35 = "std.cmpi"(%32, %34) {predicate = 0 : i64} : (index, index) -> i1
%36 = "std.select"(%35, %1, %0) : (i1, f32, f32) -> f32
"linalg.yield"(%36) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
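// annotation: the generic above (%29) rebuilds the select(i == j, 0.0, 1.0) initializer for this tile, using linalg.index plus the tile offsets %arg0/%arg1 to recover global indices; the generic below (%30) is the multiply-accumulate with standard matmul maps, (d0, d2) x (d2, d1) -> (d0, d1), reducing over d2.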
%30 = "linalg.generic"(%18, %20, %29) ( {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%31 = "std.mulf"(%arg2, %arg3) : (f32, f32) -> f32
%32 = "std.addf"(%31, %arg4) : (f32, f32) -> f32
"linalg.yield"(%32) : (f32) -> ()
}) {__internal_linalg_transform__ = "workgroup", indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], lowering.config = {tileSizes = [[64, 64]]}, operand_segment_sizes = dense<[2, 1]> : vector<2xi32>} : (tensor<?x10xf32>, tensor<10x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
"flow.dispatch.tensor.store"(%30, %6, %arg0, %arg1, %21, %22) {operand_segment_sizes = dense<[1, 1, 2, 2, 0]> : vector<5xi32>, static_offsets = [-9223372036854775808, -9223372036854775808], static_sizes = [-1, -1], static_strides = [1, 1]} : (tensor<?x?xf32>, !flow.dispatch.tensor<writeonly:10x10xf32>, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"std.return"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", type = () -> ()} : () -> ()
"hal.interface"() ( {
"hal.interface.binding"() {access = 1 : i32, binding = 0 : index, set = 0 : index, sym_name = "s0b0_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 1 : i32, binding = 1 : index, set = 0 : index, sym_name = "s0b1_ro_external", type = 7 : i32} : () -> ()
"hal.interface.binding"() {access = 6 : i32, binding = 2 : index, set = 0 : index, sym_name = "s0b2_xw_external", type = 7 : i32} : () -> ()
"hal.interface_end"() : () -> ()
}) {sym_name = "io", sym_visibility = "private"} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "system_elf_x86_64", target = #hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>} : () -> ()
"hal.executable_end"() : () -> ()
}) {sym_name = "matmul_test_dispatch_3", sym_visibility = "private"} : () -> ()
"builtin.func"() ( {
%0 = "std.constant"() {value = 10 : index} : () -> index
%1:2 = "flow.ex.stream.fragment"(%0, %0, %0, %0) ( {
%2 = "std.constant"() {value = 1 : index} : () -> index
%3 = "std.constant"() {value = 10 : index} : () -> index
%4 = "flow.dispatch"(%3, %3, %2) {entry_point = @matmul_test_dispatch_0::@matmul_test_dispatch_0, hal.bindings = [#hal.ex.result_buffer<"s0b0_xw_external", 0 : index>], operand_segment_sizes = dense<[3, 0, 0, 0]> : vector<4xi32>, tied_operands = []} : (index, index, index) -> tensor<10x10xf32>
%5 = "flow.dispatch"(%3, %3, %2, %4, %4) {entry_point = @matmul_test_dispatch_2::@matmul_test_dispatch_2, hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>], operand_segment_sizes = dense<[3, 2, 0, 0]> : vector<4xi32>, tied_operands = []} : (index, index, index, tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%6 = "flow.tensor.reshape"(%5, %3, %3) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<10x10xf32>, index, index) -> tensor<?x?xf32>
%7 = "flow.dispatch"(%3, %3, %2, %4, %4) {entry_point = @matmul_test_dispatch_3::@matmul_test_dispatch_3, hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>], operand_segment_sizes = dense<[3, 2, 0, 0]> : vector<4xi32>, tied_operands = []} : (index, index, index, tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
%8 = "flow.tensor.reshape"(%7, %3, %3) {operand_segment_sizes = dense<[1, 0, 2]> : vector<3xi32>} : (tensor<10x10xf32>, index, index) -> tensor<?x?xf32>
"flow.return"(%6, %8) : (tensor<?x?xf32>, tensor<?x?xf32>) -> ()
}) {operand_segment_sizes = dense<[0, 0, 4]> : vector<3xi32>, tied_operands = []} : (index, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>)
"check.expect_eq"(%1#0, %1#1) : (tensor<?x?xf32>, tensor<?x?xf32>) -> ()
"std.return"() : () -> ()
}) {iree.abi.stub, iree.reflection = {MatmulTest = "entry"}, sym_name = "matmul_test", type = () -> ()} : () -> ()
}) {hal.device.targets = [#hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>]}>]} : () -> ()