This file has been truncated.
// -----// IR Dump After CheckVHLOStableHloMixUsage (iree-check-vhlostablehlo-mix-usage) //----- // | |
module { | |
func.func @sort3D() { | |
%0 = util.unfoldable_constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq_const(%1, dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
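// Note: the test as imported. util.unfoldable_constant feeds the sort with a
// value the compiler must not constant-fold, so the stablehlo.sort (ascending
// along dimension 2, per the LT comparator) has to be compiled and executed;
// check.expect_eq_const then asserts the sorted result.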
// -----// IR Dump After VhloToVersionPass (vhlo-to-version) //----- // | |
module { | |
func.func @sort3D() { | |
%0 = util.unfoldable_constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq_const(%1, dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
// -----// IR Dump After VhloLegalizeToStablehloPass (vhlo-legalize-to-stablehlo) //----- // | |
module { | |
func.func @sort3D() { | |
%0 = util.unfoldable_constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq_const(%1, dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
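// Note: first real change. Canonicalization rewrote util.unfoldable_constant
// into a plain arith.constant guarded by util.optimization_barrier, and
// hoisted the expected value out of check.expect_eq_const so the check becomes
// check.expect_eq against %cst. The sort op itself is untouched.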
// -----// IR Dump After StableHLOCanonicalize (iree-stablehlo-canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After LegalizeStableHLOCustomCalls (iree-stablehlo-legalize-custom-calls) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After LegalizeControlFlow (iree-stablehlo-legalize-control-flow) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After FlattenTuplesInSCF (iree-stablehlo-preprocessing-flatten-scf-tuples) //----- // | |
module { | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
// -----// IR Dump After StableHLOToStableHLOPreprocessing (iree-stablehlo-to-stablehlo-preprocessing) //----- // | |
module { | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
// -----// IR Dump After StableHLOCanonicalize (iree-stablehlo-canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After ShapeToShapeLoweringPass (shape-to-shape-lowering) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After ConvertShapeToStandardPass (convert-shape-to-std) //----- // | |
module { | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After StableHLOCanonicalize (iree-stablehlo-canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
module { | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After StableHLOCanonicalize (iree-stablehlo-canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After LegalizeShapeComputations (iree-stablehlo-legalize-shape-computations) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({ | |
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>): | |
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1> | |
stablehlo.return %2 : tensor<i1> | |
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After ConvertStableHloToLinalgExt (iree-stablehlo-to-linalg-ext) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
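// Note: the key lowering in the input pipeline. stablehlo.sort with its
// tensor<i32> comparator region becomes iree_linalg_ext.sort dimension(2) in
// destination-passing style (outs(%0)), and the comparator is rewritten to a
// scalar arith.cmpi slt on i32. The remaining StableHLO passes below leave
// this form unchanged.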
// -----// IR Dump After LegalizeChlo (iree-stablehlo-legalize-chlo) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After ConvertStableHloToIreeInputDialects (iree-stablehlo-to-iree-input) //----- // | |
module { | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
// -----// IR Dump After ReconcileUnrealizedCastsPass (reconcile-unrealized-casts) //----- // | |
module { | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After StableHLOCanonicalize (iree-stablehlo-canonicalize) //----- // | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
// -----// IR Dump After VerifyCompilerStableHloInputLegality (iree-stablehlo-verify-compiler-input-legality) //----- // | |
module { | |
func.func @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
return | |
} | |
} | |
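// Note: end of the StableHLO input conversion. No stablehlo ops remain (only
// arith, util, iree_linalg_ext, and check), which is presumably what the
// legality verification above confirms before handing off to the core IREE
// pipeline.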
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // | |
module { | |
util.func public @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
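// Note: iree-import-public moves the module onto IREE's util dialect:
// func.func/return become util.func/util.return. The following import passes
// (ml_program import, module-name sanitization, mesh conversion, f64 demotion)
// leave the module unchanged, as their identical dumps show.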
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // | |
module { | |
util.func public @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // | |
module { | |
util.func public @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- // | |
module { | |
util.func public @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // | |
module { | |
util.func public @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // | |
module { | |
util.func public @sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
module { | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
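// Note: the ABI pass split the entry point: a public @sort3D stub carrying
// iree.abi.stub and iree.reflection metadata now just calls a new private
// @_sort3D that holds the original body. Later per-function dumps alternate
// between these two functions.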
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
module { | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
module { | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_hip]} { | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
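// Note: HAL device assignment attaches the compilation target: a single
// #hal.device.target<"hip"> with a rocm-hsaco-fb executable target for gfx942,
// including the MFMA intrinsics, subgroup size, and workgroup limits that
// later codegen can consult.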
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
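// Note: the target device is materialized as a util.global @__device_0 and the
// module's stream.affinity.default now points at it, so later stream/HAL
// passes can refer to the device symbolically instead of via a module
// attribute.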
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AttrBasedPipelinePass (iree-preprocessing-attr-based-pipeline) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
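// Note: from here the preprocessing and global-optimization passes are dumped
// per function, so each pass appears twice: once for the public @sort3D
// wrapper (a bare call, unchanged throughout this log) and once for the
// private @_sort3D body.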
// -----// IR Dump After WarnOnUninitializedValuesPass (iree-global-opt-warn-on-uninitialized-values) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After AttrBasedPipelinePass (iree-preprocessing-attr-based-pipeline) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After WarnOnUninitializedValuesPass (iree-global-opt-warn-on-uninitialized-values) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
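// NOTE: nothing is hoisted here; the constant tensors stay inline in @_sort3D and only the pre-existing @__device_0 global is present.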
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ConvertEncodingToFlowPass (iree-dispatch-creation-convert-encoding-to-flow) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
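// NOTE: first structural change -- the iree_linalg_ext.sort is wrapped into a flow.dispatch.region.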
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.region -> (tensor<1x2x4xi32>) { | |
%2 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.return %2 : tensor<1x2x4xi32> | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.region -> (tensor<1x2x4xi32>) { | |
%2 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.return %2 : tensor<1x2x4xi32> | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.region -> (tensor<1x2x4xi32>) { | |
%2 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.return %2 : tensor<1x2x4xi32> | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ConvertEncodingToFlowPass (iree-dispatch-creation-convert-encoding-to-flow) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.region -> (tensor<1x2x4xi32>) { | |
%2 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.return %2 : tensor<1x2x4xi32> | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
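// NOTE: the dispatch region becomes a flow.dispatch.workgroups op; the sorted tensor is now an explicit readwrite !flow.dispatch.tensor binding, with flow.dispatch.tensor.load/store wrapped around the sort.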
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
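// NOTE: a count() region is attached to the workgroups op, deriving the workgroup count via flow.dispatch.workgroup_count_from_slice.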
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch.workgroups(%0) : (tensor<1x2x4xi32>) -> %0 = | |
(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%2 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%4 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
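// NOTE: the workgroups body is outlined into flow.executable @_sort3D_dispatch_0, and the caller is rewritten to a flow.dispatch of its export.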
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
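// NOTE: the export is renamed to @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store, embedding a summary of the dispatched ops in the symbol name.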
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
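// NOTE: from this dump onward the sort body lives in the outlined
// flow.executable @_sort3D_dispatch_0, and @_sort3D invokes it with a tied
// result ((tensor<1x2x4xi32>) -> %0): the dispatch sorts the tensor in place
// through its readwrite !flow.dispatch.tensor binding rather than allocating
// a fresh result buffer.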
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func private @_sort3D() { | |
%cst = arith.constant dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = arith.constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %cst_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %cst) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
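// NOTE: OutlineConstantsPass hoisted the two dense<...> constants into the
// util.global ops @__constant_tensor_1x2x4xi32 and @__constant_tensor_1x2x4xi32_0
// (marked inlining_policy = #util.inline.never); @_sort3D now fetches them via
// util.global.load immutable instead of materializing arith.constant ops.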
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
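// NOTE: the iree.fixedpoint.iteration = 0 module attribute carried through the
// preceding dumps is dropped here, suggesting the global-optimization
// fixed-point loop converged after a single iteration before handing off to
// SymbolDCE below.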
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CloneToConsumersPass (iree-stream-clone-to-consumers) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
flow.executable private @_sort3D_dispatch_0 { | |
flow.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%1 = iree_linalg_ext.sort dimension(2) outs(%0 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%2 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global private @__constant_tensor_1x2x4xi32_0 {inlining_policy = #util.inline.never, stream.affinity.default = #hal.device.affinity<@__device_0>} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : tensor<1x2x4xi32> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%0 = util.optimization_barrier %__constant_tensor_1x2x4xi32_0 : tensor<1x2x4xi32> | |
%1 = flow.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%0) : (tensor<1x2x4xi32>) -> %0 | |
check.expect_eq(%1, %__constant_tensor_1x2x4xi32) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32__size} | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%1 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%2 = util.optimization_barrier %1 : !stream.resource<*> | |
%3 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%2) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %2{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.async.transfer %3 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%6 = stream.async.transfer %0 : !stream.resource<*>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%5, %7) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32__size} | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%1 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%2 = util.optimization_barrier %1 : !stream.resource<*> | |
%3 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%2) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %2{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.async.transfer %3 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%6 = stream.async.transfer %0 : !stream.resource<*>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%5, %7) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
%cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%1 = stream.resource.size %cst_0 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %1, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SpecializeEncodingsPass (iree-stream-specialize-encodings) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%0 = stream.resource.size %cst : !stream.resource<constant> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32__size : index | |
%cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1x2x4xi32> in !stream.resource<constant> = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
%1 = stream.resource.size %cst_0 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %1, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1) : (tensor<1x2x4xi32> in !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> tensor<1x2x4xi32> in %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
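// note: this SpecializeEncodingsPass snapshot is identical to the preceding dump; with no tensor
// encodings present in this sort-only module, the pass appears to have nothing to specialize.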
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- // | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__size : index | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
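// note: compared with the earlier initializer, the tensor-level stream.tensor.constant ops have been
// lowered to stream.async.constant with a statically computed byte size, and the stream.resource.size
// queries are folded away: a 1x2x4 tensor of i32 is 8 elements * 4 bytes = 32 bytes, hence %c32.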
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %__constant_tensor_1x2x4xi32_0__size for %__constant_tensor_1x2x4xi32_0__size]) : (!stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
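// note: in @_sort3D the tensor-typed stream.tensor.dispatch has been rewritten as stream.async.dispatch
// over an explicit byte range of the resource (%1[%c0 to size for size]); only the stream.tensor.export
// ops keep the tensor<1x2x4xi32> view.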
// -----// IR Dump After MaterializeEncodingsPass (iree-stream-materialize-encodings) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32__size : index | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__size : index | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private @__constant_tensor_1x2x4xi32_0__size : index | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %__constant_tensor_1x2x4xi32_0__size for %__constant_tensor_1x2x4xi32_0__size]) : (!stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
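// note: the MaterializeEncodingsPass dump stitches the per-op snapshots above back into one module;
// no encoding-specific constants or extra dispatches appear to have been materialized for this test.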
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__size : index | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__size : index | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__size : index | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__size : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__size : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
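// note: the pass has reordered the initializer so that both stream.async.constant ops come first and the
// four util.global.store ops are batched afterwards; the stored values themselves are unchanged.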
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__size : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__size : index | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %__constant_tensor_1x2x4xi32_0__size for %__constant_tensor_1x2x4xi32_0__size]) : (!stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %__constant_tensor_1x2x4xi32_0__size for %__constant_tensor_1x2x4xi32_0__size]) : (!stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %__constant_tensor_1x2x4xi32_0__size for %__constant_tensor_1x2x4xi32_0__size]) : (!stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %__constant_tensor_1x2x4xi32_0__size for %__constant_tensor_1x2x4xi32_0__size]) : (!stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
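// note: here the immutable util.global.load ops have been hoisted to the function entry, ahead of the
// arith.constant; since the globals are immutable this reordering is safe, presumably easing later folding.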
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__size = util.global.load immutable @__constant_tensor_1x2x4xi32__size : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__size = util.global.load immutable @__constant_tensor_1x2x4xi32_0__size : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %__constant_tensor_1x2x4xi32_0__size for %__constant_tensor_1x2x4xi32_0__size]) : (!stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size}) -> %1{%__constant_tensor_1x2x4xi32_0__size} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%__constant_tensor_1x2x4xi32_0__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32_0__size} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__size} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%__constant_tensor_1x2x4xi32__size} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
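// note: FoldGlobalsPass has removed the two *__size globals along with their loads and stores; their
// constant value 32 is now inlined as %c32 in @_sort3D, and the initializer only stores the two resources.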
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
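// note: this FuseGlobalsPass snapshot matches the FoldGlobalsPass output above; there appears to be
// nothing left to fuse between the two remaining constant globals.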
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
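// note: IPO also leaves the module unchanged; @sort3D is a zero-argument, zero-result wrapper around
// @_sort3D, so there are no call operands or results to propagate.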
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
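// note: this pass is verification-only as far as this dump shows; the IR is unchanged from the IPO
// output, which suggests the module is already in valid async-resource form.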
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<*> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<*>{%c32}) -> %1{%c32} | |
%3 = stream.async.transfer %2 : !stream.resource<*>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %3 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%5 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %5 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %6) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%cst = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
util.global.store %cst, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %cst_0, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%0 = stream.async.transfer %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%2 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%1[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %1{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.async.transfer %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%3, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg0{%c32} | |
%6 = stream.async.transfer %arg1 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5, %6 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (!stream.resource<external>{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
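// Note on the dump above (a hedged reading, not compiler output): relative to the
// ScheduleExecutionPass result, ScheduleConcurrencyPass has wrapped the sort dispatch
// and the constant-to-external transfer in a single stream.async.concurrent region,
// since neither op consumes the other's result and both can be recorded to run
// concurrently inside the same stream.async.execute. A distilled sketch of that
// pattern follows; %a, %b, %size, %c0 and @exe::@entry are placeholder names:
%r:2 = stream.async.concurrent with(%a as %arg_a: !stream.resource<external>{%size}, %b as %arg_b: !stream.resource<constant>{%size}) -> (!stream.resource<external>{%size}, !stream.resource<external>{%size}) {
  // independent work items grouped so they may execute concurrently
  %d = stream.async.dispatch @exe::@entry(%arg_a[%c0 to %size for %size]) : (!stream.resource<external>{%size}) -> %arg_a{%size}
  %t = stream.async.transfer %arg_b : !stream.resource<constant>{%size} -> !stream.resource<external>{%size}
  stream.yield %d, %t : !stream.resource<external>{%size}, !stream.resource<external>{%size}
}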
// -----// IR Dump After SyncInitializersPass (iree-stream-sync-initializers) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (!stream.resource<external>{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
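// Note on the dump above (a hedged reading, not compiler output): the visible change
// from SyncInitializersPass is inside util.initializer, where the await on the
// constant-upload timepoint becomes a blocking "await sync", so the initializer does
// not return until the device-side constant initialization has completed:
//   before: %0:2 = stream.timepoint.await %result_timepoint => %results#0, %results#1 : ...
//   after:  %0:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : ...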
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private mutable @__constant_tensor_1x2x4xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
%1 = stream.timepoint.immediate => !stream.timepoint | |
util.global.store %1, @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%2 = stream.timepoint.immediate => !stream.timepoint | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private mutable @__constant_tensor_1x2x4xi32_0__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32__timepoint = util.global.load @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%0 = stream.timepoint.await %__constant_tensor_1x2x4xi32__timepoint => %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} | |
%__constant_tensor_1x2x4xi32_0__timepoint = util.global.load @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%1 = stream.timepoint.await %__constant_tensor_1x2x4xi32_0__timepoint => %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32_0__timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%9 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %9 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%3 = util.optimization_barrier %2 : !stream.resource<external> | |
%4 = stream.timepoint.immediate => !stream.timepoint | |
%5 = stream.timepoint.join max(%4, %__constant_tensor_1x2x4xi32__timepoint) => !stream.timepoint | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%5) => with(%3 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%3{%c32}, !stream.resource<external>{%c32}) { | |
%9:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (!stream.resource<external>{%c32}, !stream.resource<external>{%c32}) { | |
%10 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%11 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %10, %11 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %9#0, %9#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%6:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%8, %7) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
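// Note on the dump above (a hedged reading, not compiler output): PropagateTimepointsPass
// pairs each resource global with a mutable @<name>__timepoint global initialized to
// #stream.timepoint<immediate>. Producers store the timepoint of the work that filled the
// global; consumers load it and either await it or feed it to
// stream.async.execute ... await(...), so later work can chain on the producer instead of
// blocking on the host. A distilled sketch of the pattern, using a placeholder global @g:
util.global private mutable @g__timepoint = #stream.timepoint<immediate> : !stream.timepoint
util.global private @g : !stream.resource<constant>
util.func private @consumer_sketch() {
  %c32 = arith.constant 32 : index
  %tp = util.global.load @g__timepoint : !stream.timepoint
  %res = util.global.load immutable @g : !stream.resource<constant>
  // wait (or chain) on the producing execution before using the resource
  %ready = stream.timepoint.await %tp => %res : !stream.resource<constant>{%c32}
  util.return
}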
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private mutable @__constant_tensor_1x2x4xi32__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
%1 = stream.timepoint.immediate => !stream.timepoint | |
util.global.store %1, @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%2 = stream.timepoint.immediate => !stream.timepoint | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private mutable @__constant_tensor_1x2x4xi32_0__timepoint = #stream.timepoint<immediate> : !stream.timepoint | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32__timepoint = util.global.load @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%0 = stream.timepoint.await %__constant_tensor_1x2x4xi32__timepoint => %__constant_tensor_1x2x4xi32 : !stream.resource<constant>{%c32} | |
%__constant_tensor_1x2x4xi32_0__timepoint = util.global.load @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%1 = stream.timepoint.await %__constant_tensor_1x2x4xi32_0__timepoint => %__constant_tensor_1x2x4xi32_0 : !stream.resource<constant>{%c32} | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32_0__timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%9 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %9 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%3 = util.optimization_barrier %2 : !stream.resource<external> | |
%4 = stream.timepoint.immediate => !stream.timepoint | |
%5 = stream.timepoint.join max(%4, %__constant_tensor_1x2x4xi32__timepoint) => !stream.timepoint | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%5) => with(%3 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%3{%c32}, !stream.resource<external>{%c32}) { | |
%9:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (!stream.resource<external>{%c32}, !stream.resource<external>{%c32}) { | |
%10 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%11 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %10, %11 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %9#0, %9#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%6:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%8, %7) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%1:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0, @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
util.global.store %1#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%1:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0, @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
util.global.store %1#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%1:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0, @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
util.global.store %1#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
util.global.store %1#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%1:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %1#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %1#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
util.global.store %0, @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%1:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %1#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %1#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %0, @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
util.global.store %0, @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32__timepoint = util.global.load @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__timepoint = util.global.load @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32_0__timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32__timepoint) => with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32__timepoint = util.global.load @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__timepoint = util.global.load @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32_0__timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32__timepoint) => with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32__timepoint = util.global.load @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__timepoint = util.global.load @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32_0__timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32__timepoint) => with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32__timepoint = util.global.load @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32_0__timepoint = util.global.load @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32_0__timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32__timepoint) => with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32__timepoint = util.global.load @__constant_tensor_1x2x4xi32__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32_0__timepoint = util.global.load @__constant_tensor_1x2x4xi32_0__timepoint : !stream.timepoint | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32_0__timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%__constant_tensor_1x2x4xi32__timepoint) => with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> (!stream.resource<constant>{%c32}, !stream.resource<constant>{%c32}) { | |
%cst = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32> | |
%cst_0 = stream.async.constant : !stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
stream.yield %cst, %cst_0 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
} => !stream.timepoint | |
%0:2 = stream.timepoint.await sync %result_timepoint => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %0#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %0#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}) -> !stream.resource<external>{%c32} { | |
%5 = stream.async.transfer %arg0 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %5 : !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c32} | |
%1 = util.optimization_barrier %0 : !stream.resource<external> | |
%results_0:2, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}) -> (%1{%c32}, !stream.resource<external>{%c32}) { | |
%5:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<external>{%c32}, %arg1 as %arg3: !stream.resource<constant>{%c32}) -> (%arg0{%c32}, !stream.resource<external>{%c32}) { | |
%6 = stream.async.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg2[%c0 to %c32 for %c32]) : (!stream.resource<external>{%c32}) -> %arg2{%c32} | |
%7 = stream.async.transfer %arg3 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
stream.yield %6, %7 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} | |
stream.yield %5#0, %5#1 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%2:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%3 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%4 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %2#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%4, %3) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%results:2, %result_timepoint = stream.resource.constants on(#hal.device.affinity<@__device_0>) : | |
!stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
!stream.resource<constant>{%c32} = dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32> | |
=> !stream.timepoint | |
%0 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() { | |
} => !stream.timepoint | |
%1 = stream.timepoint.join max(%result_timepoint, %0) => !stream.timepoint | |
%2:2 = stream.timepoint.await sync %1 => %results#0, %results#1 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %2#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %2#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%c0_0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c0_0], %arg1[%c0_0], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%c0_1 = arith.constant 0 : index | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}, %result_2 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0_1], %arg2[%c0_1], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_2, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%c0_i64 = arith.constant 0 : i64 | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%7 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%8 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %7[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %8, %7 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.resource.subview %1#1[%c0] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c32} | |
%c64 = arith.constant 64 : index | |
%3 = stream.resource.subview %1#1[%c64] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c32} | |
%4 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() { | |
} => !stream.timepoint | |
%5 = stream.timepoint.join max(%1#0, %4) => !stream.timepoint | |
%6:2 = stream.timepoint.await sync %5 => %2, %3 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %6#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %6#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // | |
util.initializer { | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%c128 = arith.constant 128 : index | |
%c0 = arith.constant 0 : index | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%c0_i64 = arith.constant 0 : i64 | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%7 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%8 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %7[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %8, %7 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.resource.subview %1#1[%c0] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c32} | |
%c64 = arith.constant 64 : index | |
%3 = stream.resource.subview %1#1[%c64] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c32} | |
%4 = stream.cmd.execute once on(#hal.device.affinity<@__device_0>) with() { | |
} => !stream.timepoint | |
%5 = stream.timepoint.join max(%1#0, %4) => !stream.timepoint | |
%6:2 = stream.timepoint.await sync %5 => %2, %3 : !stream.resource<constant>{%c32}, !stream.resource<constant>{%c32} | |
util.global.store %6#0, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %6#1, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c64 = arith.constant 64 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%5 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%6 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %5[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %6, %5 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
%3 = stream.resource.subview %2[%c0] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c32} | |
%4 = stream.resource.subview %2[%c64] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c32} | |
util.global.store %3, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %4, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%c0_0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c0_0], %arg1[%c0_0], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%c0_1 = arith.constant 0 : index | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}, %result_2 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0_1], %arg2[%c0_1], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_2, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%c0_0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c0_0], %arg1[%c0_0], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%c0_1 = arith.constant 0 : index | |
%result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}, %result_2 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0_1], %arg2[%c0_1], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_2, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c32}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c32}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global private mutable @__constant_tensor_1x2x4xi32__storage_size : index | |
util.global private mutable @__constant_tensor_1x2x4xi32__offset : index | |
util.global private mutable @__constant_tensor_1x2x4xi32__length : index | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c0_0 = arith.constant 0 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0_0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%5 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0_0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%6 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %5[%c0_0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %6, %5 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
%3 = stream.resource.subview %2[%c0_0] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c32} | |
%4 = stream.resource.subview %2[%c64] : !stream.resource<constant>{%c128} -> !stream.resource<constant>{%c32} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_1x2x4xi32__storage_size : index | |
util.global.store %c0_0, @__constant_tensor_1x2x4xi32__offset : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__length : index | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_1x2x4xi32_0__storage_size : index | |
util.global.store %c64, @__constant_tensor_1x2x4xi32_0__offset : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__length : index | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global private mutable @__constant_tensor_1x2x4xi32_0__storage_size : index | |
util.global private mutable @__constant_tensor_1x2x4xi32_0__offset : index | |
util.global private mutable @__constant_tensor_1x2x4xi32_0__length : index | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__storage_size = util.global.load @__constant_tensor_1x2x4xi32__storage_size : index | |
%__constant_tensor_1x2x4xi32__offset = util.global.load @__constant_tensor_1x2x4xi32__offset : index | |
%__constant_tensor_1x2x4xi32__length = util.global.load @__constant_tensor_1x2x4xi32__length : index | |
%0 = stream.resource.subview %__constant_tensor_1x2x4xi32[%__constant_tensor_1x2x4xi32__offset] : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size} -> !stream.resource<constant>{%__constant_tensor_1x2x4xi32__length} | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__storage_size = util.global.load @__constant_tensor_1x2x4xi32_0__storage_size : index | |
%__constant_tensor_1x2x4xi32_0__offset = util.global.load @__constant_tensor_1x2x4xi32_0__offset : index | |
%__constant_tensor_1x2x4xi32_0__length = util.global.load @__constant_tensor_1x2x4xi32_0__length : index | |
%1 = stream.resource.subview %__constant_tensor_1x2x4xi32_0[%__constant_tensor_1x2x4xi32_0__offset] : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size} -> !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__length} | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%2 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%1 as %arg0: !stream.resource<constant>{%c32}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c0], %arg1[%c0], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c32} | |
%4 = util.optimization_barrier %3 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%5 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%4 as %arg0: !stream.resource<external>{%c32}, %0 as %arg1: !stream.resource<constant>{%c32}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c32} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%6:2 = stream.timepoint.await %5 => %result_0, %4 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%8, %7) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_1x2x4xi32__storage_size : index | |
util.global.store %c0, @__constant_tensor_1x2x4xi32__offset : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__length : index | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_1x2x4xi32_0__storage_size : index | |
util.global.store %c64, @__constant_tensor_1x2x4xi32_0__offset : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__length : index | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_1x2x4xi32__storage_size : index | |
util.global.store %c0, @__constant_tensor_1x2x4xi32__offset : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__length : index | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_1x2x4xi32_0__storage_size : index | |
util.global.store %c64, @__constant_tensor_1x2x4xi32_0__offset : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__length : index | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_1x2x4xi32__storage_size : index | |
util.global.store %c0, @__constant_tensor_1x2x4xi32__offset : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__length : index | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c128, @__constant_tensor_1x2x4xi32_0__storage_size : index | |
util.global.store %c64, @__constant_tensor_1x2x4xi32_0__offset : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__length : index | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__length : index | |
util.global.store %c64, @__constant_tensor_1x2x4xi32_0__offset : index | |
util.global.store %c128, @__constant_tensor_1x2x4xi32_0__storage_size : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__length : index | |
util.global.store %c0, @__constant_tensor_1x2x4xi32__offset : index | |
util.global.store %c128, @__constant_tensor_1x2x4xi32__storage_size : index | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.global.store %c32, @__constant_tensor_1x2x4xi32_0__length : index | |
util.global.store %c64, @__constant_tensor_1x2x4xi32_0__offset : index | |
util.global.store %c128, @__constant_tensor_1x2x4xi32_0__storage_size : index | |
util.global.store %c32, @__constant_tensor_1x2x4xi32__length : index | |
util.global.store %c0, @__constant_tensor_1x2x4xi32__offset : index | |
util.global.store %c128, @__constant_tensor_1x2x4xi32__storage_size : index | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__storage_size = util.global.load @__constant_tensor_1x2x4xi32__storage_size : index | |
%__constant_tensor_1x2x4xi32__offset = util.global.load @__constant_tensor_1x2x4xi32__offset : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__storage_size = util.global.load @__constant_tensor_1x2x4xi32_0__storage_size : index | |
%__constant_tensor_1x2x4xi32_0__offset = util.global.load @__constant_tensor_1x2x4xi32_0__offset : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%__constant_tensor_1x2x4xi32_0__offset], %arg1[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_1x2x4xi32__offset], %arg2[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
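// Editor's note: at this point the test is fully explicit at the stream level. One
// stream.cmd.copy materializes the unsorted input from the constant pool, the dispatch sorts
// it in place along the innermost dimension (the comparator region uses arith.cmpi slt), a
// second copy materializes the precomputed sorted reference, and check.expect_eq compares the
// two exported tensors. A small Python check of the expected arithmetic, using the literal
// values visible in the dumps (a sanity sketch, not part of the compiler output):

input_tensor = [[[1, 2, 3, 4], [4, 3, 2, 1]]]   # dense<...> operand of the sort
expected     = [[[1, 2, 3, 4], [1, 2, 3, 4]]]   # dense<...> reference constant
# Ascending sort of dimension 2, mirroring the arith.cmpi slt comparator.
result = [[sorted(row) for row in plane] for plane in input_tensor]
assert result == expected  # mirrors check.expect_eq(%6, %5)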
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__storage_size = util.global.load @__constant_tensor_1x2x4xi32__storage_size : index | |
%__constant_tensor_1x2x4xi32__offset = util.global.load @__constant_tensor_1x2x4xi32__offset : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__storage_size = util.global.load @__constant_tensor_1x2x4xi32_0__storage_size : index | |
%__constant_tensor_1x2x4xi32_0__offset = util.global.load @__constant_tensor_1x2x4xi32_0__offset : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%__constant_tensor_1x2x4xi32_0__offset], %arg1[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_1x2x4xi32__offset], %arg2[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32__storage_size = util.global.load @__constant_tensor_1x2x4xi32__storage_size : index | |
%__constant_tensor_1x2x4xi32__offset = util.global.load @__constant_tensor_1x2x4xi32__offset : index | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0__storage_size = util.global.load @__constant_tensor_1x2x4xi32_0__storage_size : index | |
%__constant_tensor_1x2x4xi32_0__offset = util.global.load @__constant_tensor_1x2x4xi32_0__offset : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%__constant_tensor_1x2x4xi32_0__offset], %arg1[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_1x2x4xi32__offset], %arg2[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32__storage_size = util.global.load @__constant_tensor_1x2x4xi32__storage_size : index | |
%__constant_tensor_1x2x4xi32__offset = util.global.load @__constant_tensor_1x2x4xi32__offset : index | |
%__constant_tensor_1x2x4xi32_0__storage_size = util.global.load @__constant_tensor_1x2x4xi32_0__storage_size : index | |
%__constant_tensor_1x2x4xi32_0__offset = util.global.load @__constant_tensor_1x2x4xi32_0__offset : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%__constant_tensor_1x2x4xi32_0__offset], %arg1[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_1x2x4xi32__offset], %arg2[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
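// Editor's note: compared with the previous dump, iree-util-simplify-global-accesses only
// reordered operations: the util.global.load ops (all of globals this function never stores)
// were hoisted ahead of the arith.constant ops and the stream work. A toy illustration of that
// hoisting rule follows; the op tuples are made up for illustration and this is not IREE's
// implementation.

ops = [
    ("const", "%c32"), ("const", "%c0"),
    ("load", "@__constant_tensor_1x2x4xi32"),
    ("load", "@__constant_tensor_1x2x4xi32__storage_size"),
    ("exec", "stream.cmd.execute"),
]
stored = set()  # no util.global.store in @_sort3D, so every load is hoistable
hoisted = [op for op in ops if op[0] == "load" and op[1] not in stored]
rest    = [op for op in ops if not (op[0] == "load" and op[1] not in stored)]
# Loads float to the top; everything else keeps its relative order.
print(hoisted + rest)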
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%__constant_tensor_1x2x4xi32__storage_size = util.global.load @__constant_tensor_1x2x4xi32__storage_size : index | |
%__constant_tensor_1x2x4xi32__offset = util.global.load @__constant_tensor_1x2x4xi32__offset : index | |
%__constant_tensor_1x2x4xi32_0__storage_size = util.global.load @__constant_tensor_1x2x4xi32_0__storage_size : index | |
%__constant_tensor_1x2x4xi32_0__offset = util.global.load @__constant_tensor_1x2x4xi32_0__offset : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%__constant_tensor_1x2x4xi32_0__offset], %arg1[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32_0__storage_size} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%__constant_tensor_1x2x4xi32__offset], %arg2[%c0], %c32 : !stream.resource<constant>{%__constant_tensor_1x2x4xi32__storage_size} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.global.store %2, @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.return | |
} | |
util.global private @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32_0 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
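// Editor's note: iree-util-fold-globals replaced the loads of the __offset / __length /
// __storage_size index globals with the inline constants %c0, %c64, %c32 and %c128, which is
// exactly the layout of the 128-byte #util.composite constant: each 32-byte 1x2x4xi32 payload
// is padded out to the buffer's 64-byte alignment. A short, self-contained Python check of
// those numbers (struct packing is only used to get the byte sizes; it is not part of IREE):

import struct

align = 64
payloads = [
    struct.pack("<8i", 1, 2, 3, 4, 1, 2, 3, 4),   # sorted reference tensor
    struct.pack("<8i", 1, 2, 3, 4, 4, 3, 2, 1),   # unsorted input tensor
]
offsets, blob = [], bytearray()
for p in payloads:
    offsets.append(len(blob))
    blob += p
    blob += bytes(-len(blob) % align)             # the dense<0> : vector<32xi8> padding
print(offsets, len(payloads[0]), len(blob))       # [0, 64] 32 128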
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_1 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_1, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
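// Editor's note: iree-util-fuse-globals noticed that the initializer stored the same resource
// %2 into both @__constant_tensor_1x2x4xi32 and @__constant_tensor_1x2x4xi32_0, so the second
// global was folded away; @_sort3D now loads @__constant_tensor_1x2x4xi32 twice, and a later
// CSE removes the duplicate load. A toy sketch of this kind of fusion follows (a deliberate
// simplification, not IREE's actual algorithm).

initializer_stores = [
    ("@__constant_tensor_1x2x4xi32",   "%2"),
    ("@__constant_tensor_1x2x4xi32_0", "%2"),
]
by_value = {}
replacement = {}
for global_name, value in initializer_stores:
    # Globals always stored the same value share one representative global.
    rep = by_value.setdefault(value, global_name)
    replacement[global_name] = rep
print(replacement)
# {'@__constant_tensor_1x2x4xi32': '@__constant_tensor_1x2x4xi32',
#  '@__constant_tensor_1x2x4xi32_0': '@__constant_tensor_1x2x4xi32'}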
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_1 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_1, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_1 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_1, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%__constant_tensor_1x2x4xi32_0 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32_0 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_1 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_1, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
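// Layout of the 128-byte #composite_of_128b constant pool above (offsets derived from the
// element sizes listed in the attribute; each tensor<1x2x4xi32> is 8 x i32 = 32 bytes and is
// followed by a 32-byte zero vector to pad to the 64-byte alignment):
//   [  0,  32)  expected sorted values   dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]>
//   [ 32,  64)  zero padding
//   [ 64,  96)  unsorted input           dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]>
//   [ 96, 128)  zero padding
// This is why @_sort3D copies %c32 bytes from offset %c64 (the input to be sorted in place) and
// later copies %c32 bytes from offset %c0 (the expected result consumed by check.expect_eq).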
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
%1:2 = scf.if %did_map -> (!stream.timepoint, !stream.resource<constant>) { | |
scf.yield %0, %result : !stream.timepoint, !stream.resource<constant> | |
} else { | |
%3 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%4 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %3[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
scf.yield %4, %3 : !stream.timepoint, !stream.resource<constant> | |
} | |
%2 = stream.timepoint.await sync %1#0 => %1#1 : !stream.resource<constant>{%c128} | |
util.global.store %2, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
cf.br ^bb3(%0, %result : !stream.timepoint, !stream.resource<constant>) | |
^bb2: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb3(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb3(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb1, ^bb2 | |
cf.br ^bb4 | |
^bb4: // pred: ^bb3 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
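// For reference, a minimal standalone sketch (not taken from this module) of the pattern
// convert-scf-to-cf applies to the initializer above: an scf.if that yields values becomes a
// cf.cond_br into two forwarding blocks that branch to a merge block carrying the results as
// block arguments; the later canonicalizer pass then folds the trivial forwarding branches.
func.func @select_like(%cond: i1, %a: i32, %b: i32) -> i32 {
  %r = scf.if %cond -> (i32) {
    scf.yield %a : i32
  } else {
    scf.yield %b : i32
  }
  return %r : i32
}
// ...lowers to:
func.func @select_like(%cond: i1, %a: i32, %b: i32) -> i32 {
  cf.cond_br %cond, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  cf.br ^bb3(%a : i32)
^bb2:  // pred: ^bb0
  cf.br ^bb3(%b : i32)
^bb3(%r: i32):  // 2 preds: ^bb1, ^bb2
  return %r : i32
}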
// -----// IR Dump After SCFToControlFlowPass (convert-scf-to-cf) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding, %arg1: index) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%arg1] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%3 = arith.cmpi slt, %arg2, %arg3 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%c0_2 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0 : index) { | |
rw %arg0[%c0_2 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: index {stream.values = [0 : index]}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%arg1] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%3 = arith.cmpi slt, %arg2, %arg3 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%c0_2 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0 : index) { | |
rw %arg0[%c0_2 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: index {stream.values = [0 : index]}) { | |
%0 = util.assume.int %arg1<umin = 0, umax = 0> : index | |
%c0 = arith.constant 0 : index | |
%1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%2 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%3 = iree_linalg_ext.sort dimension(2) outs(%2 : tensor<1x2x4xi32>) { | |
^bb0(%arg2: i32, %arg3: i32): | |
%4 = arith.cmpi slt, %arg2, %arg3 : i32 | |
iree_linalg_ext.yield %4 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %3, %1, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%c0_2 = arith.constant 0 : index | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0 : index) { | |
rw %arg0[%c0_2 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: i32, %arg2: i32) { | |
%0 = arith.extui %arg1 : i32 to i64 | |
%1 = arith.extui %arg2 : i32 to i64 | |
%c32_i64 = arith.constant 32 : i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = util.assume.int %4<umin = 0, umax = 0> : index | |
%c0 = arith.constant 0 : index | |
%6 = stream.binding.subspan %arg0[%5] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%7 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%8 = iree_linalg_ext.sort dimension(2) outs(%7 : tensor<1x2x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
%9 = arith.cmpi slt, %arg3, %arg4 : i32 | |
iree_linalg_ext.yield %9 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %8, %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%c0_2 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c0_i32 = arith.constant 0 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%c0_i64_3 = arith.constant 0 : i64 | |
%c0_i32_4 = arith.constant 0 : i32 | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32_4 : i32, i32) { | |
rw %arg0[%c0_2 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
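// note: the dispatch entry point above receives its binding offset as two i32
// push constants and recombines them into a 64-bit value (extui + shli + ori)
// before casting to index. A minimal standalone restatement of that preamble,
// for illustration only (the function name here is made up and is not part of
// the pipeline output):
func.func @recombine_offset_sketch(%lo: i32, %hi: i32) -> index {
  %c32_i64 = arith.constant 32 : i64
  // zero-extend both 32-bit halves to i64
  %lo64 = arith.extui %lo : i32 to i64
  %hi64 = arith.extui %hi : i32 to i64
  // offset = lo | (hi << 32)
  %hi_shifted = arith.shli %hi64, %c32_i64 : i64
  %combined = arith.ori %lo64, %hi_shifted : i64
  // cast the unsigned 64-bit offset to index
  %offset = arith.index_castui %combined : i64 to index
  return %offset : index
}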
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
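// note: @sort3D is only the public ABI stub that forwards to the private
// @_sort3D implementation, so the cleanup passes above (canonicalize, cse,
// int-arithmetic optimization, global-access simplification, pattern
// application) find nothing to change in it.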
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
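// note: the initializer first tries to map the 128-byte composite constant
// directly into a constant resource (stream.resource.try_map) and only falls
// back to allocating and reading it through a stream.file when mapping fails;
// the cleanup passes above leave this control flow as-is.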
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32 : i32, i32) { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
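// note: compared with the pre-canonicalization form of @_sort3D, the duplicate
// zero/offset constants (%c0_2, %c0_i64, %c0_i32_4, %c32_i64, ...) have been
// folded away, so the dispatch now passes the single %c0_i32 for both operands
// and reuses %c0 for the resource range.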
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32 : i32, i32) { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32 : i32, i32) { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%c0_i32 = arith.constant 0 : i32 | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32 : i32, i32) { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
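// note: iree-util-simplify-global-accesses hoisted the immutable
// util.global.load of @__constant_tensor_1x2x4xi32 to the top of @_sort3D;
// the rest of the function is unchanged.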
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32 : i32, i32) { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: i32, %arg2: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = arith.extui %arg1 : i32 to i64 | |
%1 = arith.extui %arg2 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = util.assume.int %4<umin = 0, umax = 0> : index | |
%6 = stream.binding.subspan %arg0[%5] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%7 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%8 = iree_linalg_ext.sort dimension(2) outs(%7 : tensor<1x2x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
%9 = arith.cmpi slt, %arg3, %arg4 : i32 | |
iree_linalg_ext.yield %9 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %8, %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32 : i32, i32) { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
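// note: the module only has the device global plus a single constant global
// (@__constant_tensor_1x2x4xi32), so iree-util-fold-globals appears to leave
// it unchanged here.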
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: i32, %arg2: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = arith.extui %arg1 : i32 to i64 | |
%1 = arith.extui %arg2 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = util.assume.int %4<umin = 0, umax = 0> : index | |
%6 = stream.binding.subspan %arg0[%5] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%7 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%8 = iree_linalg_ext.sort dimension(2) outs(%7 : tensor<1x2x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
%9 = arith.cmpi slt, %arg3, %arg4 : i32 | |
iree_linalg_ext.yield %9 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %8, %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32 : i32, i32) { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
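// note: iree-util-fuse-globals likewise appears to make no changes; this dump
// matches the previous one.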
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: i32, %arg2: i32) { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = arith.extui %arg1 : i32 to i64 | |
%1 = arith.extui %arg2 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = util.assume.int %4<umin = 0, umax = 0> : index | |
%6 = stream.binding.subspan %arg0[%5] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%7 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%8 = iree_linalg_ext.sort dimension(2) outs(%7 : tensor<1x2x4xi32>) { | |
^bb0(%arg3: i32, %arg4: i32): | |
%9 = arith.cmpi slt, %arg3, %arg4 : i32 | |
iree_linalg_ext.yield %9 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %8, %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%c0_i32, %c0_i32 : i32, i32) { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
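// note: @_sort3D takes no arguments and returns no results, so iree-util-ipo
// has nothing to propagate across the @sort3D -> @_sort3D call and the module
// is unchanged.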
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0_i32 = arith.constant 0 : i32 | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = arith.extui %c0_i32 : i32 to i64 | |
%1 = arith.extui %c0_i32 : i32 to i64 | |
%2 = arith.shli %1, %c32_i64 : i64 | |
%3 = arith.ori %0, %2 : i64 | |
%4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index | |
%5 = util.assume.int %4<umin = 0, umax = 0> : index | |
%6 = stream.binding.subspan %arg0[%5] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%7 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%8 = iree_linalg_ext.sort dimension(2) outs(%7 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%9 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %9 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %8, %6, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
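// note: the dispatch's two i32 operands are the same constant zero at its only
// call site, so iree-stream-fold-uniform-operands inlined them into the
// executable: the entry point no longer takes %arg1/%arg2 and instead
// materializes %c0_i32 locally, and the stream.cmd.dispatch drops its operand
// list. The offset-recombination preamble now runs entirely on constants and
// can presumably fold down to a zero offset in later cleanup.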
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
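// note: the dumps switch here from the constant initializer to the private @_sort3D function, which is run through the same pass sequence.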
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
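// note: the CSE and OptimizeIntArithmeticPass dumps below leave @_sort3D unchanged relative to the canonicalized form above.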
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
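// note: SimplifyGlobalAccessesPass (next dump) hoists the immutable util.global.load above the arith.constant ops; ApplyPatternsPass after it reorders the constants and places the load back after them. The stream commands themselves are untouched.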
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
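// note: from FoldGlobalsPass onward the dumps print the whole module; the 128-byte constant pool is now referenced through the #composite_of_128b attribute alias instead of being spelled inline in the initializer.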
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
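// note: the module-level dumps that follow (FuseGlobalsPass, IPOPass, SymbolDCE, AssignLegacyTargetDevicesPass, MaterializeTargetDevicesPass, ResolveDevicePromisesPass) repeat this module unchanged; none of those passes modifies it.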
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
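// Context for the dumps below: this is the HAL configuration and interface
// materialization stretch of the IREE pass pipeline. A trace in this form is
// what the standard MLIR option --mlir-print-ir-after-all emits when passed to
// iree-compile; the exact command line used here (input file, HIP/gfx942
// target flags) is not recorded in this log.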
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
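// Promised device attributes would be resolved against the module's device
// globals here; this module declares @__device_0 directly, so the dump below
// appears identical to the previous one.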
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
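// Device aliases would be expanded into concrete #hal.device.target lists at
// this point; @__device_0 already holds the concrete #device_target_hip, so
// nothing changes in this dump.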
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
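// Verification-only step: the pass checks the device/target configuration and,
// as the dump shows, leaves the IR as-is.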
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
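// The next several dumps come from per-function cleanup (canonicalize, cse,
// simplify-global-accesses, apply-patterns). The public @sort3D wrapper is
// already a bare call into @_sort3D, so these passes leave it untouched.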
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
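// The same cleanup sequence over the constant initializer. The #util.composite
// payload is printed inline here, presumably because the dump is scoped to the
// initializer rather than the whole module; the constant itself is unchanged.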
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func private @_sort3D() { | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
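// First visible effect of the cleanup on @_sort3D: the load of the immutable
// @__constant_tensor_1x2x4xi32 global is hoisted to the top of the function,
// ahead of the arith.constant ops.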
util.func private @_sort3D() { | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%c0 = arith.constant 0 : index | |
%c32 = arith.constant 32 : index | |
%c128 = arith.constant 128 : index | |
%c64 = arith.constant 64 : index | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
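// Pattern application then re-sorts the leading arith.constant ops (%c64,
// %c128, %c32, %c0) and places the global load after them; the stream commands
// are not modified.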
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
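// Back to a whole-module dump. With only the device global and a single
// immutable constant global there is nothing to fold; the only difference from
// the last full dump is the constant ordering in @_sort3D carried over from
// the previous step.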
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
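// Likewise there is only one constant global, so no fusion candidates exist
// and the module comes through unchanged.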
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
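// A second device-verification run after the global passes; again purely a
// check, with no IR changes.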
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
stream.executable private @_sort3D_dispatch_0 { | |
stream.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
stream.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store(%arg0: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg1: i32, %arg2: i32): | |
%3 = arith.cmpi slt, %arg1, %arg2 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
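  // NOTE (editor annotation): @_sort3D stages the check in two submissions. The
  // first stream.cmd.execute copies the 32-byte unsorted input out of the packed
  // constant (offset %c64) into an external resource, with
  // util.optimization_barrier preventing that copy from being constant-folded.
  // The second submission runs the sort dispatch in place on that resource while
  // concurrently copying the expected sorted values (offset %c0) into a second
  // resource; check.expect_eq then compares the two exported tensors.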
} | |
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
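// NOTE (editor annotation): #composite_of_128b packs both test constants into
// one 128-byte buffer, padding each 32-byte tensor<1x2x4xi32> (1*2*4 elements
// of 4 bytes) out to the 64-byte resource alignment:
//   bytes [  0,  32)  expected result  [[1, 2, 3, 4], [1, 2, 3, 4]]
//   bytes [ 32,  64)  zero padding
//   bytes [ 64,  96)  unsorted input   [[1, 2, 3, 4], [4, 3, 2, 1]]
//   bytes [ 96, 128)  zero padding
// This layout is why @_sort3D copies 32 bytes from offset %c64 for the input
// and from offset %c0 for the expected result.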
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
hal.executable private @_sort3D_dispatch_0 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@rocm_hsaco_fb::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
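// NOTE (editor annotation): relative to the previous dump,
// MaterializeInterfacesPass has wrapped the dispatch in a hal.executable with
// an explicit #hal.pipeline.layout (a single Indirect storage_buffer binding).
// The dispatch function now reaches its I/O through hal.interface.binding.subspan
// instead of a !stream.binding argument, the export region returns via
// hal.return, and stream.cmd.dispatch addresses the variant-qualified symbol
// @_sort3D_dispatch_0::@rocm_hsaco_fb::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.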
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- // | |
#composite_of_128b = #util.composite<128xi8, [ | |
dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>, | |
dense<0> : vector<32xi8>, | |
]> | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @sort3D() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @sort3D() -> ()"}} { | |
util.call @_sort3D() : () -> () | |
util.return | |
} | |
hal.executable private @_sort3D_dispatch_0 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
} | |
util.global private @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.initializer { | |
%c0 = arith.constant 0 : index | |
%c0_i64 = arith.constant 0 : i64 | |
%c128 = arith.constant 128 : index | |
%0 = stream.timepoint.immediate => !stream.timepoint | |
%buffer_cst = util.buffer.constant {alignment = 64 : index} : !util.buffer = #composite_of_128b | |
%did_map, %result = stream.resource.try_map on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128} | |
cf.cond_br %did_map, ^bb2(%0, %result : !stream.timepoint, !stream.resource<constant>), ^bb1 | |
^bb1: // pred: ^bb0 | |
%1 = stream.resource.alloc uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<constant>{%c128} | |
%file = stream.file.constant on(#hal.device.affinity<@__device_0>) %buffer_cst[%c0 for %c128] : !util.buffer{%c128} -> !stream.file | |
%2 = stream.file.read on(#hal.device.affinity<@__device_0>) await(%0) => %file[%c0_i64], %1[%c0], %c128 : !stream.file -> !stream.resource<constant>{%c128} => !stream.timepoint | |
cf.br ^bb2(%2, %1 : !stream.timepoint, !stream.resource<constant>) | |
^bb2(%3: !stream.timepoint, %4: !stream.resource<constant>): // 2 preds: ^bb0, ^bb1 | |
%5 = stream.timepoint.await sync %3 => %4 : !stream.resource<constant>{%c128} | |
util.global.store %5, @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
util.return | |
} | |
util.func private @_sort3D() { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c32 = arith.constant 32 : index | |
%c0 = arith.constant 0 : index | |
%__constant_tensor_1x2x4xi32 = util.global.load immutable @__constant_tensor_1x2x4xi32 : !stream.resource<constant> | |
%result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%__constant_tensor_1x2x4xi32 as %arg0: !stream.resource<constant>{%c128}, %result as %arg1: !stream.resource<external>{%c32}) { | |
stream.cmd.copy %arg0[%c64], %arg1[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} => !stream.timepoint | |
%1 = stream.timepoint.await %0 => %result : !stream.resource<external>{%c32} | |
%2 = util.optimization_barrier %1 : !stream.resource<external> | |
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c32} => !stream.timepoint | |
%3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%2 as %arg0: !stream.resource<external>{%c32}, %__constant_tensor_1x2x4xi32 as %arg1: !stream.resource<constant>{%c128}, %result_0 as %arg2: !stream.resource<external>{%c32}) { | |
stream.cmd.concurrent { | |
stream.cmd.dispatch @_sort3D_dispatch_0::@rocm_hsaco_fb::@_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store { | |
rw %arg0[%c0 for %c32] : !stream.resource<external>{%c32} | |
} | |
stream.cmd.copy %arg1[%c0], %arg2[%c0], %c32 : !stream.resource<constant>{%c128} -> !stream.resource<external>{%c32} | |
} | |
} => !stream.timepoint | |
%4:2 = stream.timepoint.await %3 => %result_0, %2 : !stream.resource<external>{%c32}, !stream.resource<external>{%c32} | |
%5 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#0 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
%6 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %4#1 : tensor<1x2x4xi32> in !stream.resource<external>{%c32} -> tensor<1x2x4xi32> | |
check.expect_eq(%6, %5) : tensor<1x2x4xi32> | |
util.return | |
} | |
} | |
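// NOTE (editor annotation): PruneExecutablesPass leaves the module unchanged
// here; the only executable, variant, and export are all referenced by the
// stream.cmd.dispatch above, so there is nothing to prune.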
// -----// IR Dump After MaterializeDeviceEncodingPass (iree-codegen-materialize-device-encoding) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After MaterializeEncodingIntoPaddingPass (iree-codegen-materialize-encoding-into-padding) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After ROCDLConfigureBufferInstructionsPass (iree-rocdl-configure-buffer-instructions) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
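// NOTE (editor annotation): the only change from this pass is the
// {iree_gpu.use_rocdl_buffer_instructions} unit attribute added to the binding
// subspan; it marks the binding as eligible for AMD buffer instructions, which
// bufferization later materializes as an amdgpu.fat_raw_buffer_cast (see the
// IREEComprehensiveBufferizePass dump below).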
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After MaterializeTuningSpecsPass (iree-codegen-materialize-tuning-specs) //----- // | |
module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // | |
module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- // | |
module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
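// NOTE (editor annotation): strategy selection attaches
// translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>
// to the dispatch function and a lowering_config with tile_sizes = [[1, 128]]
// to the iree_linalg_ext.sort op; these attributes drive the tiling and
// distribution passes that follow.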
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
hal.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // | |
hal.executable private @_sort3D_dispatch_0 { | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
hal.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
} | |
// -----// IR Dump After HoistExecutableObjectsPass (iree-hal-hoist-executable-objects) //----- // | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
hal.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // | |
module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
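// NOTE (editor annotation): this dump is identical to the previous one; the
// pass, which would normally wrap workgroup-tiled work in an scf.forall
// distributed over workgroups, introduces no such loop for this small in-place
// sort.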
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After ROCDLConfigureBufferInstructionsPass (iree-rocdl-configure-buffer-instructions) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After EmptyTensorToAllocTensorPass (empty-tensor-to-alloc-tensor) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
%1 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> -> tensor<1x2x4xi32> | |
%2 = iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : tensor<1x2x4xi32>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%3 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %3 : i1 | |
} -> tensor<1x2x4xi32> | |
flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [1, 2, 4], strides = [1, 1, 1] : tensor<1x2x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x4xi32>> | |
return | |
} | |
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
memref.copy %1, %1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
return | |
} | |
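// NOTE (editor annotation): comprehensive bufferization rewrites the dispatch
// onto memrefs: the binding subspan now yields
// memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>>, the
// iree_gpu.use_rocdl_buffer_instructions hint becomes an
// amdgpu.fat_raw_buffer_cast with a 64-byte memref.assume_alignment, and the
// sort mutates that buffer in place. The trailing memref.copy %1, %1 is a
// self-copy left over from the tensor-level store and is removed by the
// canonicalizer in the next dump.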
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
memref.copy %1, %1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After LLVMGPUTileAndDistributePass (iree-llvmgpu-tile-and-distribute) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After PropagateDispatchSizeBoundsPass (iree-codegen-propagate-dispatch-size-bounds) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After LLVMGPULowerExecutableTargetPass (iree-llvmgpu-lower-executable-target) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After VerifyWorkgroupDistributionPass (iree-codegen-verify-workgroup-distribution) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUDistribute workgroup_size = [128, 1, 1]>} { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 128]]>} dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
// -----// IR Dump After ReconcileTranslationInfoPass (iree-codegen-reconcile-translation-info) //----- // | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>) { | |
hal.executable.export public @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) attributes {workgroup_size = [128 : index, 1 : index, 1 : index]} { | |
^bb0(%arg0: !hal.device): | |
%c1 = arith.constant 1 : index | |
%c1_0 = arith.constant 1 : index | |
%c1_1 = arith.constant 1 : index | |
hal.return %c1, %c1_0, %c1_1 : index, index, index | |
} | |
builtin.module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #hal.descriptor_type<storage_buffer>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConvertHALDescriptorTypeToGPUAddressSpacePass (iree-codegen-convert-hal-descriptor-type-to-gpu-address-space) //----- // | |
module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #gpu.address_space<global>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #gpu.address_space<global>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #gpu.address_space<global>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #gpu.address_space<global>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #gpu.address_space<global>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #gpu.address_space<global>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
} | |
// -----// IR Dump After LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- // | |
module { | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #gpu.address_space<global>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #gpu.address_space<global>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
iree_linalg_ext.sort dimension(2) outs(%1 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>) { | |
^bb0(%arg0: i32, %arg1: i32): | |
%2 = arith.cmpi slt, %arg0, %arg1 : i32 | |
iree_linalg_ext.yield %2 : i1 | |
} | |
return | |
} | |
} | |
// -----// IR Dump After LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #gpu.address_space<global>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #gpu.address_space<global>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
scf.for %arg0 = %c0 to %c1 step %c1 { | |
scf.for %arg1 = %c0 to %c2 step %c1 { | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
scf.for %arg3 = %c0 to %c3 step %c1 { | |
%2 = arith.addi %arg3, %c1 : index | |
%3 = memref.load %1[%arg0, %arg1, %arg3] : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
%4 = memref.load %1[%arg0, %arg1, %2] : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
%5 = arith.cmpi slt, %3, %4 : i32 | |
scf.if %5 { | |
} else { | |
%6 = arith.addi %arg3, %c1 : index | |
memref.store %4, %1[%arg0, %arg1, %arg3] : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.store %3, %1[%arg0, %arg1, %6] : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
} | |
} | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- // | |
func.func @_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store() { | |
%c3 = arith.constant 3 : index | |
%c4 = arith.constant 4 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(Indirect) : memref<1x2x4xi32, #gpu.address_space<global>> | |
%1 = amdgpu.fat_raw_buffer_cast %0 resetOffset : memref<1x2x4xi32, #gpu.address_space<global>> to memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>> | |
scf.for %arg0 = %c0 to %c1 step %c1 { | |
scf.for %arg1 = %c0 to %c2 step %c1 { | |
scf.for %arg2 = %c0 to %c4 step %c1 { | |
scf.for %arg3 = %c0 to %c3 step %c1 { | |
        %2 = arith.addi %arg3, %c1 : index
        %3 = memref.load %1[%arg0, %arg1, %arg3] : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>
        %4 = memref.load %1[%arg0, %arg1, %2] : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>
        %5 = arith.cmpi slt, %3, %4 : i32
        scf.if %5 {
        } else {
          %6 = arith.addi %arg3, %c1 : index
          memref.store %4, %1[%arg0, %arg1, %arg3] : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>
          memref.store %3, %1[%arg0, %arg1, %6] : memref<1x2x4xi32, #amdgpu.address_space<fat_raw_buffer>>
        }
      }
    }
  }
}
return
}