This file has been truncated.
// -----// IR Dump After TosaToSCF (tosa-to-scf) //----- //
func.func @main() {
%0 = util.unfoldable_constant dense<1> : tensor<1024x1024xi8>
%1 = util.unfoldable_constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq_const(%4, dense<1024> : tensor<1024x1024xi32>) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TopLevelSCFToCFG (iree-top-level-scf-to-cfg) //----- //
func.func @main() {
%0 = util.unfoldable_constant dense<1> : tensor<1024x1024xi8>
%1 = util.unfoldable_constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq_const(%4, dense<1024> : tensor<1024x1024xi32>) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Inliner (inline) //----- //
module {
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaToArith (tosa-to-arith) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaToTensor (tosa-to-tensor) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaToLinalgExt (iree-tosa-to-linalg-ext) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaOptionalDecompositions (tosa-optional-decompositions) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaToLinalgNamed (tosa-to-linalg-named) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaLayerwiseConstantFoldPass (tosa-layerwise-constant-fold) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaMakeBroadcastable (tosa-make-broadcastable) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaValidation (tosa-validate) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaToArith (tosa-to-arith) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After TosaToTensor (tosa-to-tensor) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After StripSignedness (iree-flow-strip-signedness) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-linalg-quantized-matmul-to-matmul) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-linalg-quantized-conv-to-conv) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After VerifyCompilerTOSAInputLegality (iree-tosa-verify-compiler-input-legality) //----- //
module {
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IREEImportPublic (iree-import-public) //----- //
module {
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After ImportMLProgram (iree-import-ml-program) //----- //
module {
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) //----- //
module {
func.func @main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Inliner (inline) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-flow-convert-1x1-filter-conv2d-to-matmul) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After EraseUnusedLinalgOperands (iree-flow-erase-unused-linalg-operands) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After ExpandTensorShapes (iree-flow-expand-tensor-shapes) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After RaiseSpecialOps (iree-flow-raise-special-ops) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After RaiseSpecialOps (iree-flow-raise-special-ops) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After LinalgDetensorize (linalg-detensorize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CollapseDims (iree-flow-collapse-dims) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After LinalgDetensorize (linalg-detensorize) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CollapseDims (iree-flow-collapse-dims) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
func.func private @_main() {
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1_1 = arith.constant 1 : index
%4 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0, %c1024, %c1_1]
%c0_2 = arith.constant 0 : index
%c1024_3 = arith.constant 1024 : index
%c1_4 = arith.constant 1 : index
%5 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_2, %c1024_3, %c1_4]
%c0_5 = arith.constant 0 : index
%c1_6 = arith.constant 1 : index
%6 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 - s0) ceildiv s2)>()[%c0_5, %c1, %c1_6]
%7 = flow.dispatch.region[%4, %5, %6] -> (tensor<1024x1024xi32>) {
%8 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.return %8 : tensor<1024x1024xi32>
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%7, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%c0_i32 = arith.constant 0 : i32
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = flow.dispatch.region[%c1024, %c1024, %c1] -> (tensor<1024x1024xi32>) {
%5 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.return %5 : tensor<1024x1024xi32>
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%4, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
flow.return
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
flow.return
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
flow.return
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
flow.return
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch.workgroups[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32> =
(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%3 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
flow.return
} count(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After VerifyInput (iree-stream-verify-input) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After OutlineConstants (iree-stream-outline-constants) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
flow.executable private @_main_dispatch_0 {
flow.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>) {
%c0_i32 = arith.constant 0 : i32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%2 = tensor.empty() : tensor<1024x1024xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%4 = linalg.matmul ins(%0, %1 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%3 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = arith.constant dense<1024> : tensor<1024x1024xi32>
%cst_0 = arith.constant dense<1> : tensor<1024x1024xi8>
%0 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%1 = util.optimization_barrier %cst_0 : tensor<1024x1024xi8>
%2 = flow.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%0, %1) : (tensor<1024x1024xi8>, tensor<1024x1024xi8>) -> tensor<1024x1024xi32>
check.expect_eq(%2, %cst) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After ConvertToStream (iree-stream-conversion) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%c0_i32 = arith.constant 0 : i32
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = stream.tensor.constant : tensor<1024x1024xi32> in !stream.resource<constant> = dense<1024> : tensor<1024x1024xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
%cst_0 = stream.tensor.constant : tensor<1024x1024xi8> in !stream.resource<constant> = dense<1> : tensor<1024x1024xi8>
%2 = stream.resource.size %cst_0 : !stream.resource<constant>
%3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} -> !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%c0 = arith.constant 0 : index
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8}
%11 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%8} -> tensor<1024x1024xi32>
%12 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%13 = stream.tensor.export %12 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%11, %13) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After VerifyLoweringToTensors (iree-stream-verify-lowering-to-tensors) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%c0_i32 = arith.constant 0 : i32
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%cst = stream.tensor.constant : tensor<1024x1024xi32> in !stream.resource<constant> = dense<1024> : tensor<1024x1024xi32>
%0 = stream.resource.size %cst : !stream.resource<constant>
%1 = stream.async.transfer %cst : !stream.resource<constant>{%0} -> !stream.resource<*>{%0}
%cst_0 = stream.tensor.constant : tensor<1024x1024xi8> in !stream.resource<constant> = dense<1> : tensor<1024x1024xi8>
%2 = stream.resource.size %cst_0 : !stream.resource<constant>
%3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} -> !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%c0 = arith.constant 0 : index
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8}
%11 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%8} -> tensor<1024x1024xi32>
%12 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%13 = stream.tensor.export %12 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%11, %13) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<1024x1024xi8> : index
%3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
%10 = stream.async.transfer %9 : !stream.resource<*>{%8} -> !stream.resource<external>{%8}
%11 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%8} -> tensor<1024x1024xi32>
%12 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%13 = stream.tensor.export %12 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%11, %13) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<1024x1024xi8> : index
%3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0}
%9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%10, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<1024x1024xi8> : index
%3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0}
%9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%10, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<1024x1024xi8> : index
%3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0}
%9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%10, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<1024x1024xi8> : index
%3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0}
%9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%10, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<1024x1024xi8> : index
%3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0}
%9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%10, %12) : tensor<1024x1024xi32>
return
}
}
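// Note: at this point the whole program is in the stream dialect: the matmul lives in a
// stream.executable, and the host code in @_main builds its operands with stream.tensor.splat,
// launches the kernel via stream.async.dispatch, and exports both results for the check op.
// FuseGlobals (and the IPO / CombineInitializers dumps that follow) appear to leave the module
// untouched, presumably because there are no util.global ops to fold and @_main takes no arguments.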
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<1024x1024xi8> : index
%3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0}
%9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%10, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.tensor.sizeof tensor<1024x1024xi32> : index
%1 = stream.tensor.splat %c1024_i32 : i32 -> tensor<1024x1024xi32> in !stream.resource<*>{%0}
%2 = stream.tensor.sizeof tensor<1024x1024xi8> : index
%3 = stream.tensor.splat %c1_i8 : i8 -> tensor<1024x1024xi8> in !stream.resource<*>{%2}
%4 = util.optimization_barrier %3 : !stream.resource<*>
%5 = util.optimization_barrier %3 : !stream.resource<*>
%6 = stream.resource.size %4 : !stream.resource<*>
%7 = stream.resource.size %5 : !stream.resource<*>
%8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%0}
%9 = stream.async.transfer %8 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
%11 = stream.async.transfer %1 : !stream.resource<*>{%0} -> !stream.resource<external>{%0}
%12 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%0} -> tensor<1024x1024xi32>
check.expect_eq(%10, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%1 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = util.optimization_barrier %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.resource.size %3 : !stream.resource<*>
%6 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%2[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%9 = stream.async.transfer %0 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%8, %10) : tensor<1024x1024xi32>
return
}
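// Note: EncodeHostTensors folded the stream.tensor.sizeof queries into constant byte sizes and
// lowered stream.tensor.splat to stream.async.splat. The constants match the tensor shapes:
//   1024 x 1024 x i32 -> 1024 * 1024 * 4 = 4194304 bytes (%c4194304)
//   1024 x 1024 x i8  -> 1024 * 1024 * 1 = 1048576 bytes (%c1048576)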
// -----// IR Dump After EncodeDeviceTensors (iree-stream-encode-device-tensors) //----- //
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
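// Note: EncodeDeviceTensors leaves the executable body unchanged here; the dispatch already
// operates on plain i8/i32 dispatch tensors, so there appears to be nothing to re-encode.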
// -----// IR Dump After MaterializeBuiltins (iree-stream-materialize-builtins) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%1 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%2 = util.optimization_barrier %1 : !stream.resource<*>
%3 = util.optimization_barrier %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.resource.size %3 : !stream.resource<*>
%6 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%2[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
%7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%9 = stream.async.transfer %0 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%8, %10) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
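// Note: canonicalization only reordered the body of @_main: the i32 splat producing the expected
// result (now %8) was sunk to just before its transfer/export, after the dispatch; everything
// else is unchanged from the previous dump.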
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
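// Note: MaterializeCopyOnWrite inserts no copies in this program; the splatted resources feeding
// the dispatch are only read, never updated in place, so there appears to be nothing that needs
// copy-on-write protection.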
// -----// IR Dump After ElideAsyncCopies (iree-stream-elide-async-copies) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After EmplaceAllocations (iree-stream-emplace-allocations) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After EmplaceAllocations (iree-stream-emplace-allocations) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<*>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<*>
%2 = util.optimization_barrier %0 : !stream.resource<*>
%3 = stream.resource.size %1 : !stream.resource<*>
%4 = stream.resource.size %2 : !stream.resource<*>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<*>{%3}, !stream.resource<*>{%4}) -> !stream.resource<*>{%c4194304}
%6 = stream.async.transfer %5 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%7 = stream.tensor.export %6 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%8 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<*>{%c4194304}
%9 = stream.async.transfer %8 : !stream.resource<*>{%c4194304} -> !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %10) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After RefineUsage (iree-stream-refine-usage) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%6, %8) : tensor<1024x1024xi32>
return
}
}
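// Note: RefineUsage replaced the placeholder !stream.resource<*> lifetimes with concrete ones:
// the splatted matmul operands become !stream.resource<transient>, while the dispatch result and
// the expected-value splat are produced directly as !stream.resource<external>, which lets the
// stream.async.transfer ops from the earlier dumps be dropped.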
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%6, %8) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%6, %8) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%6, %8) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%6, %8) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%6, %8) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%6, %8) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%1[%c0 to %3 for %3], %2[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
%8 = stream.tensor.export %7 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%6, %8) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) //----- //
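// Note: as seen in the dump below, ScheduleExecution appears to partition the async ops of @_main into stream.async.execute regions gated by !stream.timepoint values: the i8 splat feeding both optimization barriers gets its own region, and the matmul dispatch plus the expected-value i32 splat share a second region that is awaited before the tensor exports.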
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg0[%c0 to %3 for %3], %arg1[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%9 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %8, %9 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) //----- //
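// Note: in the dump below, the second execute region now wraps the dispatch and the i32 splat in a stream.async.concurrent region; the two results do not depend on each other, so they can be issued concurrently.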
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After PropagateTimepoints (iree-stream-propagate-timepoints) //----- //
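// Note: below, explicit timepoints are threaded through the program; already-available inputs show up as stream.timepoint.immediate values and the second stream.async.execute gains an await(...) clause on one of them (folded away again by the canonicalization that follows).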
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%11 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %11 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%5 = stream.timepoint.immediate => !stream.timepoint
%6 = stream.timepoint.immediate => !stream.timepoint
%7 = stream.timepoint.immediate => !stream.timepoint
%results_0:2, %result_timepoint_1 = stream.async.execute await(%7) => with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%11:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%12 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%13 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %12, %13 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %11#0, %11#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%8:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%9 = stream.tensor.export %8#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%10 = stream.tensor.export %8#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%10, %9) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After VerifyLoweringToAsync (iree-stream-verify-lowering-to-async) //----- //
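// Note: this appears to be a verification-only step; the module below is unchanged from the preceding IPO dump.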
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%results, %result_timepoint = stream.async.execute with() -> !stream.resource<transient>{%c1048576} {
%8 = stream.async.splat %c1_i8 : i8 -> !stream.resource<transient>{%c1048576}
stream.yield %8 : !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c1048576}
%1 = util.optimization_barrier %0 : !stream.resource<transient>
%2 = util.optimization_barrier %0 : !stream.resource<transient>
%3 = stream.resource.size %1 : !stream.resource<transient>
%4 = stream.resource.size %2 : !stream.resource<transient>
%results_0:2, %result_timepoint_1 = stream.async.execute with(%1 as %arg0: !stream.resource<transient>{%3}, %2 as %arg1: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%8:2 = stream.async.concurrent with(%arg0 as %arg2: !stream.resource<transient>{%3}, %arg1 as %arg3: !stream.resource<transient>{%4}) -> (!stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}) {
%9 = stream.async.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%arg2[%c0 to %3 for %3], %arg3[%c0 to %4 for %4]) : (!stream.resource<transient>{%3}, !stream.resource<transient>{%4}) -> !stream.resource<external>{%c4194304}
%10 = stream.async.splat %c1024_i32 : i32 -> !stream.resource<external>{%c4194304}
stream.yield %9, %10 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
}
stream.yield %8#0, %8#1 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
} => !stream.timepoint
%5:2 = stream.timepoint.await %result_timepoint_1 => %results_0#1, %results_0#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%6 = stream.tensor.export %5#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%7 = stream.tensor.export %5#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%7, %6) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After PackConstants (iree-stream-pack-constants) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) //----- //
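// Note: from this point the program is in explicit command form, as the dump below shows: stream.async.execute becomes stream.cmd.execute, the transient splat becomes a stream.resource.alloc plus stream.cmd.fill, the dispatch is expressed as stream.cmd.dispatch with ro/wo resource bindings, and the two external result buffers are allocated up front.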
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0_0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%c0_1 = arith.constant 0 : index
%7:2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7#0 as %arg2: !stream.resource<external>{%c4194304}, %7#1 as %arg3: !stream.resource<external>{%c4194304}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0_1 for %c4194304] : !stream.resource<external>{%c4194304}
}
stream.cmd.fill %c1024_i32, %arg3[%c0_1 for %c4194304] : i32 -> !stream.resource<external>{%c4194304}
}
} => !stream.timepoint
%9:2 = stream.timepoint.await %8 => %7#1, %7#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%11 = stream.tensor.export %9#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%11, %10) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After PackConstants (iree-stream-pack-constants) //----- //
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0_0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%c0_1 = arith.constant 0 : index
%7:2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7#0 as %arg2: !stream.resource<external>{%c4194304}, %7#1 as %arg3: !stream.resource<external>{%c4194304}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0_1 for %c4194304] : !stream.resource<external>{%c4194304}
}
stream.cmd.fill %c1024_i32, %arg3[%c0_1 for %c4194304] : i32 -> !stream.resource<external>{%c4194304}
}
} => !stream.timepoint
%9:2 = stream.timepoint.await %8 => %7#1, %7#0 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%10 = stream.tensor.export %9#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%11 = stream.tensor.export %9#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%11, %10) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) //----- //
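// Note: below, the two 4194304-byte external results are combined into a single stream.resource.pack with two slices; the packed allocation is then carved back into per-result views via stream.resource.subview.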
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0_0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%c0_1 = arith.constant 0 : index
%7:3 = stream.resource.pack slices({
[0, 0] = %c4194304,
[0, 0] = %c4194304
}) : index
%8 = stream.resource.alloc uninitialized : !stream.resource<external>{%7#0}
%9 = stream.resource.subview %8[%7#1] : !stream.resource<external>{%7#0} -> !stream.resource<external>{%c4194304}
%10 = stream.resource.subview %8[%7#2] : !stream.resource<external>{%7#0} -> !stream.resource<external>{%c4194304}
%11 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %9 as %arg2: !stream.resource<external>{%c4194304}, %10 as %arg3: !stream.resource<external>{%c4194304}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0_1 for %c4194304] : !stream.resource<external>{%c4194304}
}
stream.cmd.fill %c1024_i32, %arg3[%c0_1 for %c4194304] : i32 -> !stream.resource<external>{%c4194304}
}
} => !stream.timepoint
%12:2 = stream.timepoint.await %11 => %10, %9 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%13 = stream.tensor.export %12#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%14 = stream.tensor.export %12#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%14, %13) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) //----- //
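// Note: the symbolic pack is now laid out concretely -- one 8388608-byte external allocation with slice offsets 0 and 4194304, i.e. 2 x (1024*1024*4) bytes for the two i32 result tensors.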
func.func private @_main() {
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%c0_0 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0_0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%c0_1 = arith.constant 0 : index
%c0_2 = arith.constant 0 : index
%c4194304_3 = arith.constant 4194304 : index
%c4194304_4 = arith.constant 4194304 : index
%c8388608 = arith.constant 8388608 : index
%c8388608_5 = arith.constant 8388608 : index
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608_5}
%8 = stream.resource.subview %7[%c0_2] : !stream.resource<external>{%c8388608_5} -> !stream.resource<external>{%c4194304}
%9 = stream.resource.subview %7[%c4194304_4] : !stream.resource<external>{%c8388608_5} -> !stream.resource<external>{%c4194304}
%10 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %8 as %arg2: !stream.resource<external>{%c4194304}, %9 as %arg3: !stream.resource<external>{%c4194304}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0_1 for %c4194304] : !stream.resource<external>{%c4194304}
}
stream.cmd.fill %c1024_i32, %arg3[%c0_1 for %c4194304] : i32 -> !stream.resource<external>{%c4194304}
}
} => !stream.timepoint
%11:2 = stream.timepoint.await %10 => %9, %8 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %11#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- //
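// Note: with no resource-carrying calls or globals to rewrite here, this pass seems to leave the structure intact apart from re-materializing a few duplicate index constants, which the following canonicalize/CSE runs clean up.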
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c0_0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%c0_1 = arith.constant 0 : index
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0_1 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%c0_2 = arith.constant 0 : index
%c0_3 = arith.constant 0 : index
%c4194304_4 = arith.constant 4194304 : index
%c4194304_5 = arith.constant 4194304 : index
%c8388608 = arith.constant 8388608 : index
%c8388608_6 = arith.constant 8388608 : index
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608_6}
%8 = stream.resource.subview %7[%c0_3] : !stream.resource<external>{%c8388608_6} -> !stream.resource<external>{%c4194304}
%9 = stream.resource.subview %7[%c4194304_5] : !stream.resource<external>{%c8388608_6} -> !stream.resource<external>{%c4194304}
%10 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %8 as %arg2: !stream.resource<external>{%c4194304}, %9 as %arg3: !stream.resource<external>{%c4194304}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0_0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0_0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0_2 for %c4194304] : !stream.resource<external>{%c4194304}
}
stream.cmd.fill %c1024_i32, %arg3[%c0_2 for %c4194304] : i32 -> !stream.resource<external>{%c4194304}
}
} => !stream.timepoint
%11:2 = stream.timepoint.await %10 => %9, %8 : !stream.resource<external>{%c4194304}, !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %11#0 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11#1 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
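// Note: canonicalization folds the duplicated constants and sinks the result subviews into their uses: the cmd.execute region now writes directly into the packed 8388608-byte resource (matmul result at offset 0, constant fill at offset 4194304), and subviews are taken only for the two final tensor exports.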
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
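// Note on the buffer sizes in the canonicalized dump above: each i8 operand
// occupies 1024 * 1024 = 1048576 bytes and each i32 result occupies
// 1024 * 1024 * 4 = 4194304 bytes, so the single 8388608-byte external
// allocation holds both results at byte offsets 0 and 4194304. The concurrent
// stream.cmd.fill writes the expected matmul value 1024 (an all-ones i8
// product summed over the K = 1024 dimension) into the second half, and
// check.expect_eq then compares the two exported tensors.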
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After VerifyLoweringToCmd (iree-stream-verify-lowering-to-cmd) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
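// The dumps that follow (Canonicalizer, CSE, SimplifyGlobalAccesses,
// ApplyPatterns, FoldGlobals, FuseGlobals, IPO, and SCFToControlFlow) reprint
// @main and @_main without further changes; in particular, @_main contains no
// scf ops, so SCFToControlFlow leaves it untouched.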
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
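// The dispatch body in the executable above is a zero-fill of the i32 accumulator followed
// by an i8 x i8 -> i32 matmul. With both inputs splatted to 1, every output element is a sum
// of 1024 products of 1*1, i.e. 1024, which is exactly the value check.expect_eq compares
// against. A small NumPy sketch of that computation (an illustration, not the IREE lowering):

import numpy as np

a = np.ones((1024, 1024), dtype=np.int8)
b = np.ones((1024, 1024), dtype=np.int8)
acc = np.zeros((1024, 1024), dtype=np.int32)        # linalg.fill ins(0 : i32)
acc += a.astype(np.int32) @ b.astype(np.int32)      # accumulate the i8 matmul in i32
assert (acc == 1024).all()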
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After ElideTimepoints (iree-stream-elide-timepoints) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseDispatchBindings (iree-stream-fuse-dispatch-bindings) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%c0_0 = arith.constant 0 : index
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%c0, %c0, %c0 : index, index, index) {
ro %arg0[%c0_0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0_0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0_0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
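// In this dump FuseDispatchBindings has added three index operands (%arg3..%arg5) to the
// dispatch entry point, feeding the stream.binding.subspan offsets (all %c0 at the single
// call site), and the wo binding now spans the full 8388608-byte external allocation instead
// of just the 4194304-byte result region. A minimal Python sketch of offset-based binding
// addressing, loosely mirroring the subspan-at-an-offset pattern (an illustration only):

class Binding:
    """A byte buffer plus a base offset; views are taken relative to that offset."""
    def __init__(self, storage: bytearray, offset: int):
        self.storage = storage
        self.offset = offset

    def view(self, length: int) -> memoryview:
        return memoryview(self.storage)[self.offset:self.offset + length]

external = bytearray(8388608)
result_binding = Binding(external, 0)        # binding covers the whole allocation...
result_view = result_binding.view(4194304)   # ...while the kernel still touches the result region
assert len(result_view) == 4194304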
// -----// IR Dump After AnnotateDispatchArguments (iree-stream-annotate-dispatch-arguments) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%c0_0 = arith.constant 0 : index
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%c0, %c0, %c0 : index, index, index) {
ro %arg0[%c0_0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0_0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0_0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After PackDispatchOperands (iree-stream-pack-dispatch-operands) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32) {
%0 = arith.index_castui %arg3 {stream.values = [0 : index]} : i32 to index
%1 = arith.index_castui %arg4 {stream.values = [0 : index]} : i32 to index
%2 = arith.index_castui %arg5 {stream.values = [0 : index]} : i32 to index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%3 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%4 = stream.binding.subspan %arg1[%1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%5 = stream.binding.subspan %arg2[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%8 = tensor.empty() : tensor<1024x1024xi32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%10 = linalg.matmul ins(%6, %7 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%9 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%c0_0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c0_i32_1 = arith.constant 0 : i32
%c0_i32_2 = arith.constant 0 : i32
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%c0_i32, %c0_i32_1, %c0_i32_2 : i32, i32, i32) {
ro %arg0[%c0_0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0_0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0_0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
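// Compared with the previous dump, PackDispatchOperands has turned the three index offsets
// passed to the dispatch into i32 operands; inside the executable they are widened back to
// index with arith.index_castui. For the zero offsets used here, and any value that fits in
// 32 bits, that round trip is lossless. A small Python sketch of the same idea (hypothetical
// helper names, not IREE API):

def pack_index_to_i32(value: int) -> int:
    assert 0 <= value < 2**32, "operand must fit the packed i32"
    return value & 0xFFFFFFFF

def unpack_i32_to_index(packed: int) -> int:
    return packed  # index_castui: unsigned widening, no value change

for offset in (0, 4194304, 8388608):
    assert unpack_i32_to_index(pack_index_to_i32(offset)) == offset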
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32) {
%0 = arith.index_castui %arg3 {stream.values = [0 : index]} : i32 to index
%1 = arith.index_castui %arg4 {stream.values = [0 : index]} : i32 to index
%2 = arith.index_castui %arg5 {stream.values = [0 : index]} : i32 to index
%c0_i32 = arith.constant 0 : i32
%3 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%4 = stream.binding.subspan %arg1[%1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%5 = stream.binding.subspan %arg2[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%8 = tensor.empty() : tensor<1024x1024xi32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%10 = linalg.matmul ins(%6, %7 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%9 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%c0_i32 = arith.constant 0 : i32
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1](%c0_i32, %c0_i32, %c0_i32 : i32, i32, i32) {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldUniformOperands (iree-stream-fold-uniform-operands) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index
%1 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index
%2 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index
%c0_i32_0 = arith.constant 0 : i32
%3 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%4 = stream.binding.subspan %arg1[%1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%5 = stream.binding.subspan %arg2[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%8 = tensor.empty() : tensor<1024x1024xi32>
%9 = linalg.fill ins(%c0_i32_0 : i32) outs(%8 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%10 = linalg.matmul ins(%6, %7 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%9 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%c0_i32 = arith.constant 0 : i32
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
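// Relative to the previous dump, FoldUniformOperands saw that the only dispatch site passes
// the same %c0_i32 for all three operands, so it dropped them from the stream.cmd.dispatch
// and re-materialized the constant inside @_main_dispatch_0_matmul_1024x1024x1024. A hedged
// Python analogy of that rewrite on an ordinary function (not the actual IREE implementation):

def callee_before(buf, offset):
    return buf[offset:]

def caller_before(buf):
    return callee_before(buf, 0)    # every call site passes the same constant 0

def callee_after(buf):
    offset = 0                      # uniform operand folded into the callee
    return buf[offset:]

def caller_after(buf):
    return callee_after(buf)        # operand removed from the call site

data = list(range(8))
assert caller_before(data) == caller_after(data)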
// -----// IR Dump After MemoizeChannels (iree-stream-memoize-channels) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index
%1 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index
%2 = arith.index_castui %c0_i32 {stream.values = [0 : index]} : i32 to index
%c0_i32_0 = arith.constant 0 : i32
%3 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%4 = stream.binding.subspan %arg1[%1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%5 = stream.binding.subspan %arg2[%2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%8 = tensor.empty() : tensor<1024x1024xi32>
%9 = linalg.fill ins(%c0_i32_0 : i32) outs(%8 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%10 = linalg.matmul ins(%6, %7 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%9 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%c0_i32 = arith.constant 0 : i32
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
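// For reference (annotation, not part of the pass dump): the canonicalize pass greedily
// applies each op's registered canonicalization patterns and folders until a fixed point.
// Nothing left in @_main matches a pattern, so the IR above is already stable. A minimal,
// hypothetical example of the kind of rewrite it performs:
//
//   %c0 = arith.constant 0 : index
//   %y  = arith.addi %x, %c0 : index   // x + 0
//
// folds so that uses of %y become uses of %x, and the now-dead constant is dropped.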
// -----// IR Dump After CSE (cse) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
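// For reference (annotation, not part of the pass dump): cse removes side-effect-free ops
// that are exact duplicates of an earlier op in the same scope, rewriting uses to the
// surviving value. The constants in @_main are already unique, so nothing changes here.
// Hypothetical sketch:
//
//   %c4   = arith.constant 4 : index
//   %c4_0 = arith.constant 4 : index        // duplicate of %c4
//   %sum  = arith.addi %c4, %c4_0 : index
//
// becomes %sum = arith.addi %c4, %c4 after cse, and the duplicate constant is erased.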
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
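// For reference (annotation, not part of the pass dump): iree-util-fold-globals folds
// util.global ops, e.g. turning a mutable global that is only ever stored with a single
// constant into an immutable initialized global and dropping globals with no remaining
// loads. With no globals in this module it is a no-op. A hedged, hypothetical sketch
// (names invented):
//
//   util.global private mutable @flag : i1
//   // ... and the only store anywhere is:  util.global.store %true, @flag : i1
//
// can be folded to an immutable  util.global private @flag = true  with the store removed.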
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
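// For reference (annotation, not part of the pass dump): iree-util-fuse-globals merges
// util.global ops that provably always hold the same value into a single global, updating
// all loads and stores to the surviving symbol. Again a no-op here, since the module
// declares no globals at all.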
// -----// IR Dump After IPO (iree-util-ipo) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
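// For reference (annotation, not part of the pass dump): iree-util-ipo performs simple
// interprocedural optimization across call edges, e.g. hoisting arguments that every caller
// passes as the same constant into the callee and pruning unused arguments/results. @main
// only calls @_main with no operands, so there is nothing to propagate. Hypothetical sketch:
//
//   %c1 = arith.constant 1 : index
//   call @f(%c1) : (index) -> ()   // every call site passes the same constant
//
// lets IPO drop the argument from @f and materialize the constant inside its body instead.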
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
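// For reference (annotation, not part of the pass dump): symbol-dce erases symbols that are
// not externally visible and have no remaining symbol uses. Everything here is still live:
// @_main is called from the public @main, and @_main_dispatch_0 is referenced by the
// stream.cmd.dispatch, so the module is unchanged. Hypothetical sketch:
//
//   func.func private @unused() { return }   // private, never referenced
//
// would be deleted by this pass.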
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
module {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}>
#device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}>
module attributes {hal.device.targets = [#device_target_vulkan]} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
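// For reference (annotation, not part of the pass dump): iree-hal-assign-target-devices
// records the compilation targets on the module, which is what the dump above shows: a
// #hal.device.target<"vulkan", ...> attribute wrapping #executable_target_vulkan_spirv_fb,
// whose spirv.target_env declares the capabilities (Int8, StorageBuffer8BitAccess,
// subgroup_size = 16, max_compute_workgroup_size = [512, 512, 512], etc.) that the later
// SPIR-V code generation passes are allowed to rely on for this Vulkan target.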
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass (iree-hal-verify-target-environment) //----- //
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}>
#device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}>
module attributes {hal.device.targets = [#device_target_vulkan]} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
stream.executable private @_main_dispatch_0 {
stream.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
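// Note: a quick sanity check of the byte-size constants in @_main above, assuming the
// usual tightly packed row-major layout (these values are taken directly from the IR):
//   1024 * 1024 * 1 byte  (i8)  = 1048576 -> %c1048576, each transient input buffer
//   1024 * 1024 * 4 bytes (i32) = 4194304 -> %c4194304, one 1024x1024 i32 result
//   2 * 4194304                 = 8388608 -> %c8388608, the external buffer that packs the
//     matmul result at offset 0 and the expected value (filled with %c1024_i32) at offset
//     4194304, which the two subviews and check.expect_eq then compare.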
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
#device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}>
module attributes {hal.device.targets = [#device_target_vulkan]} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
hal.executable private @_main_dispatch_0 {
hal.executable.variant public @vulkan_spirv_fb, target = #executable_target_vulkan_spirv_fb {
hal.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
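// Note: after MaterializeInterfaces the stream.binding arguments are replaced by a concrete
// #pipeline_layout: descriptor set 0 with bindings 0 and 1 as read-only storage buffers for
// the two i8 operands and binding 2 as the writeonly i32 result. The dispatch site records
// the same mapping in its hal.interface.bindings attribute, in operand order:
//   binding <0, 0> <- ro %arg0 (lhs), binding <0, 1> <- ro %arg1 (rhs), binding <0, 2> <- wo %arg2 (result)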
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::DumpExecutableSourcesPass (iree-hal-dump-executable-sources) //----- //
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
#device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}>
module attributes {hal.device.targets = [#device_target_vulkan]} {
func.func @main() attributes {iree.abi.stub} {
call @_main() : () -> ()
return
}
hal.executable private @_main_dispatch_0 {
hal.executable.variant public @vulkan_spirv_fb, target = #executable_target_vulkan_spirv_fb {
hal.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
}
}
func.func private @_main() {
%c8388608 = arith.constant 8388608 : index
%c0 = arith.constant 0 : index
%c4194304 = arith.constant 4194304 : index
%c1048576 = arith.constant 1048576 : index
%c1_i8 = arith.constant 1 : i8
%c1024_i32 = arith.constant 1024 : i32
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1048576}
%1 = stream.cmd.execute with(%0 as %arg0: !stream.resource<transient>{%c1048576}) {
stream.cmd.fill %c1_i8, %arg0[%c0 for %c1048576] : i8 -> !stream.resource<transient>{%c1048576}
} => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%c1048576}
%3 = util.optimization_barrier %2 : !stream.resource<transient>
%4 = util.optimization_barrier %2 : !stream.resource<transient>
%5 = stream.resource.size %3 : !stream.resource<transient>
%6 = stream.resource.size %4 : !stream.resource<transient>
%7 = stream.resource.alloc uninitialized : !stream.resource<external>{%c8388608}
%8 = stream.cmd.execute with(%3 as %arg0: !stream.resource<transient>{%5}, %4 as %arg1: !stream.resource<transient>{%6}, %7 as %arg2: !stream.resource<external>{%c8388608}) {
stream.cmd.concurrent {
stream.cmd.dispatch @_main_dispatch_0::@_main_dispatch_0_matmul_1024x1024x1024[%c1024, %c1024, %c1] {
ro %arg0[%c0 for %5] : !stream.resource<transient>{%5},
ro %arg1[%c0 for %6] : !stream.resource<transient>{%6},
wo %arg2[%c0 for %c8388608] : !stream.resource<external>{%c8388608}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]}
stream.cmd.fill %c1024_i32, %arg2[%c4194304 for %c4194304] : i32 -> !stream.resource<external>{%c8388608}
}
} => !stream.timepoint
%9 = stream.timepoint.await %8 => %7 : !stream.resource<external>{%c8388608}
%10 = stream.resource.subview %9[%c4194304] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%11 = stream.resource.subview %9[%c0] : !stream.resource<external>{%c8388608} -> !stream.resource<external>{%c4194304}
%12 = stream.tensor.export %10 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
%13 = stream.tensor.export %11 : tensor<1024x1024xi32> in !stream.resource<external>{%c4194304} -> tensor<1024x1024xi32>
check.expect_eq(%13, %12) : tensor<1024x1024xi32>
return
}
}
// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- //
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
}
// -----// IR Dump After DecomposeSoftmax (iree-linalg-ext-decompose-softmax) //----- //
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
// -----// IR Dump After RematerializeParallelOps (iree-codegen-rematerialize-parallel-ops) //----- //
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x1024xi8>
%5 = tensor.empty() : tensor<1024x1024xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
return
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}> {
hal.executable.export public @_main_dispatch_0_matmul_1024x1024x1024 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c32 = arith.constant 32 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
hal.return %c32, %c128, %c1 : index, index, index
}
builtin.module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%c8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<?x1024xi8>
%8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %c32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x?xi8>
%9 = tensor.empty() : tensor<8x32xi32>
%cast = tensor.cast %9 : tensor<8x32xi32> to tensor<?x?xi32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%cast : tensor<?x?xi32>) -> tensor<?x?xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%7, %8 : tensor<?x1024xi8>, tensor<1024x?xi8>) outs(%10 : tensor<?x?xi32>) -> tensor<?x?xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [%c8, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
}
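// Note: the static workgroup count returned above is just the 1024x1024 output divided by
// the first-level tile sizes from the lowering_config [[8, 32], [4, 4], [0, 0, 4]]:
//   N: 1024 / 32 = 32  -> %c32  (workgroup x)
//   M: 1024 / 8  = 128 -> %c128 (workgroup y)
//   K is not distributed across workgroups, so z stays at %c1.
// Each workgroup then covers one 8x32 tile of the result via the two scf.for loops over
// workgroup_id_y * 8 and workgroup_id_x * 32.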
// -----// IR Dump After FuseTensorPadWithConsumer (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%c8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<?x1024xi8>
%8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %c32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x?xi8>
%9 = tensor.empty() : tensor<8x32xi32>
%cast = tensor.cast %9 : tensor<8x32xi32> to tensor<?x?xi32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%cast : tensor<?x?xi32>) -> tensor<?x?xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%7, %8 : tensor<?x1024xi8>, tensor<1024x?xi8>) outs(%10 : tensor<?x?xi32>) -> tensor<?x?xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [%c8, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c8, %c32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<?x?xi32>
%cast = tensor.cast %7 : tensor<?x?xi32> to tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%c8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<?x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, %c32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x?xi8>
%cast_0 = tensor.cast %cast : tensor<8x32xi32> to tensor<?x?xi32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%cast_0 : tensor<?x?xi32>) -> tensor<?x?xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<?x1024xi8>, tensor<1024x?xi8>) outs(%10 : tensor<?x?xi32>) -> tensor<?x?xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [%c8, %c32], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
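// Note: destination-passing style replaced the locally created destination with a load from
// the writeonly output binding, intended to let the later bufferization write the result in
// place instead of allocating a temporary. Compare the excerpts from the two dumps above:
//   before: %9 = tensor.empty() : tensor<8x32xi32>           (fresh destination for fill/matmul)
//   after:  %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], ...
//           (the fill and matmul now chain back to binding 2, and the final store targets
//            the same offsets of %2).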
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) //----- //
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After SPIRVCreateFastSlowPath (iree-spirv-create-fast-slow-path) //----- //
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<8x32xi32>) -> tensor<8x32xi32>
%11 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%8, %9 : tensor<8x1024xi8>, tensor<1024x32xi8>) outs(%10 : tensor<8x32xi32>) -> tensor<8x32xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
// -----// IR Dump After SPIRVTile (iree-spirv-tile) //----- //
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice : tensor<4x4xi32>) -> tensor<4x4xi32>
%13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) {
%extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8>
%extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8>
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<4x4xi8>, tensor<4x4xi8>) outs(%arg7 : tensor<4x4xi32>) -> tensor<4x4xi32>
scf.yield %14 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
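// Note: SPIRVTile applies the second level of the lowering_config: the 8x32 workgroup tile
// is split into 4x4 accumulator tiles and the K dimension is looped in steps of 4. The tile
// loops carry iree.spirv.distribute_dim so a later distribution step can map them onto the
// [8, 2, 1] workgroup invocations, assuming the usual one-4x4-tile-per-invocation mapping:
//   columns: 32 / 4 = 8 tiles  -> distribute_dim 0, matching workgroup_size x = 8
//   rows:     8 / 4 = 2 tiles  -> distribute_dim 1, matching workgroup_size y = 2
//   K:     1024 / 4 = 256 sequential iterations of the innermost scf.for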
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice : tensor<4x4xi32>) -> tensor<4x4xi32>
%13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) {
%extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8>
%extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8>
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<4x4xi8>, tensor<4x4xi8>) outs(%arg7 : tensor<4x4xi32>) -> tensor<4x4xi32>
scf.yield %14 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice : tensor<4x4xi32>) -> tensor<4x4xi32>
%13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) {
%extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8>
%extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8>
%14 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 32], [4, 4], [0, 0, 4]]>} ins(%extracted_slice_0, %extracted_slice_1 : tensor<4x4xi8>, tensor<4x4xi8>) outs(%arg7 : tensor<4x4xi32>) -> tensor<4x4xi32>
scf.yield %14 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
--- After vectorization ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x4xi32>
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32>
%13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) {
%extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8>
%extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8>
%14 = vector.transfer_read %extracted_slice_0[%c0, %c0], %c0_i8 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (d0, 0, d1)>} : tensor<4x4xi8>, vector<4x4x4xi8>
%15 = vector.transfer_read %extracted_slice_1[%c0, %c0], %c0_i8 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>} : tensor<4x4xi8>, vector<4x4x4xi8>
%16 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32>
%17 = arith.extsi %14 : vector<4x4x4xi8> to vector<4x4x4xi32>
%18 = arith.extsi %15 : vector<4x4x4xi8> to vector<4x4x4xi32>
%19 = arith.muli %17, %18 : vector<4x4x4xi32>
%20 = vector.multi_reduction <add>, %19, %16 [2] : vector<4x4x4xi32> to vector<4x4xi32>
%21 = vector.transfer_write %20, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32>
scf.yield %21 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
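For reference, the heart of what vectorization produced can be pulled out as the small standalone function below. This is a minimal sketch assembled from the ops in the dump above (function and value names are made up, not compiler output): the LHS row and RHS column reads are broadcast into 4x4x4 vectors, widened to i32, multiplied, and reduced over the contraction dimension.

func.func @tile_body_vectorized(%lhs: tensor<4x4xi8>, %rhs: tensor<4x4xi8>, %acc: tensor<4x4xi32>) -> vector<4x4xi32> {
  %c0 = arith.constant 0 : index
  %pad_i8 = arith.constant 0 : i8
  %pad_i32 = arith.constant 0 : i32
  // Broadcasting reads: each 4x4xi8 tile is expanded to a 4x4x4 vector via the permutation maps.
  %a = vector.transfer_read %lhs[%c0, %c0], %pad_i8 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (d0, 0, d1)>} : tensor<4x4xi8>, vector<4x4x4xi8>
  %b = vector.transfer_read %rhs[%c0, %c0], %pad_i8 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)>} : tensor<4x4xi8>, vector<4x4x4xi8>
  %c = vector.transfer_read %acc[%c0, %c0], %pad_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32>
  // Widen i8 -> i32, multiply elementwise, and reduce over the K dimension (dim 2).
  %a32 = arith.extsi %a : vector<4x4x4xi8> to vector<4x4x4xi32>
  %b32 = arith.extsi %b : vector<4x4x4xi8> to vector<4x4x4xi32>
  %mul = arith.muli %a32, %b32 : vector<4x4x4xi32>
  %red = vector.multi_reduction <add>, %mul, %c [2] : vector<4x4x4xi32> to vector<4x4xi32>
  return %red : vector<4x4xi32>
}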
--- After peephole optimization ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x4xi32>
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32>
%13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) {
%extracted_slice_0 = tensor.extract_slice %8[%arg2, %arg6] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8>
%extracted_slice_1 = tensor.extract_slice %9[%arg6, %arg4] [4, 4] [1, 1] : tensor<1024x32xi8> to tensor<4x4xi8>
%14 = vector.transfer_read %extracted_slice_0[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : tensor<4x4xi8>, vector<4x4xi8>
%15 = vector.transfer_read %extracted_slice_1[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : tensor<4x4xi8>, vector<4x4xi8>
%16 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32>
%17 = arith.extsi %14 : vector<4x4xi8> to vector<4x4xi32>
%18 = arith.extsi %15 : vector<4x4xi8> to vector<4x4xi32>
%19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<4x4xi32>, vector<4x4xi32> into vector<4x4xi32>
%20 = vector.transfer_write %19, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32>
scf.yield %20 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
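The peephole stage folded the broadcasting 4x4x4 reads and the multi_reduction back into plain 2-D reads feeding a single vector.contract with the usual matmul indexing maps. A minimal standalone sketch of the resulting kernel op (illustrative names; the i32 operands stand for the already sign-extended i8 tiles):

func.func @tile_contract(%a: vector<4x4xi32>, %b: vector<4x4xi32>, %acc: vector<4x4xi32>) -> vector<4x4xi32> {
  // A[i, k] * B[k, j] accumulated into C[i, j], with k as the reduction dimension.
  %r = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %a, %b, %acc : vector<4x4xi32>, vector<4x4xi32> into vector<4x4xi32>
  return %r : vector<4x4xi32>
}

This rewrite is also why the later "--- After lowering multi_reduction ops ---" dump is identical to the preceding one: no vector.multi_reduction survives to that point.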
--- After folding tensor extract/insert slice ops ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x4xi32>
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32>
%13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) {
%14 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<4x4xi8>
%15 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<4x4xi8>
%16 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32>
%17 = arith.extsi %14 : vector<4x4xi8> to vector<4x4xi32>
%18 = arith.extsi %15 : vector<4x4xi8> to vector<4x4xi32>
%19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<4x4xi32>, vector<4x4xi32> into vector<4x4xi32>
%20 = vector.transfer_write %19, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32>
scf.yield %20 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
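The slice folding above removed the per-tile tensor.extract_slice ops feeding the i8 reads; the transfer_reads now index the 8x1024 and 1024x32 tiles directly with the loop induction variables as offsets (the extract/insert_slice around the 8x32 accumulator tile remain). A minimal before/after sketch of the folding, with made-up names:

func.func @fold_read_of_slice(%t: tensor<8x1024xi8>, %i: index, %j: index) -> vector<4x4xi8> {
  %pad = arith.constant 0 : i8
  // Before folding (shown as a comment):
  //   %s = tensor.extract_slice %t[%i, %j] [4, 4] [1, 1] : tensor<8x1024xi8> to tensor<4x4xi8>
  //   %v = vector.transfer_read %s[%c0, %c0], %pad {in_bounds = [true, true]} : tensor<4x4xi8>, vector<4x4xi8>
  // After folding, the slice offsets move into the read itself:
  %v = vector.transfer_read %t[%i, %j], %pad {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<4x4xi8>
  return %v : vector<4x4xi8>
}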
--- After lowering multi_reduction ops ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x4xi32>
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.transfer_write %cst, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32>
%13 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %12) -> (tensor<4x4xi32>) {
%14 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<4x4xi8>
%15 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<4x4xi8>
%16 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<4x4xi32>
%17 = arith.extsi %14 : vector<4x4xi8> to vector<4x4xi32>
%18 = arith.extsi %15 : vector<4x4xi8> to vector<4x4xi32>
%19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<4x4xi32>, vector<4x4xi32> into vector<4x4xi32>
%20 = vector.transfer_write %19, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xi32>, tensor<4x4xi32>
scf.yield %20 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %13 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {4, 4, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
JAKUB: #87
JAKUB: #94
JAKUB: #103
JAKUB: #109
JAKUB: #77
JAKUB: #81
JAKUB: lowerToInnerPro: 1
JAKUB: bounds: {1, 1, 4}
JAKUB: nativeSize: {1, 1, 4}
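The "JAKUB: ..." lines above are debug prints from the instrumented unrolling code: for each vector.contract they report the iteration bounds and the chosen native size, and "lowerToInnerPro: 1" presumably indicates lowering toward an inner-product form. With bounds {4, 4, 4} and native size {1, 1, 4}, each 4x4x4 contraction is split into 4 x 4 x 1 = 16 pieces, matching the sixteen vector.contract ops per K step in the next dump; the later {1, 1, 4} entries appear to be the already native-sized pieces being revisited. Each piece multiplies a 1x4 LHS row slice by a 4x1 RHS column slice into a 1x1 accumulator element, as in this standalone sketch (illustrative names):

func.func @unrolled_piece(%row: vector<1x4xi32>, %col: vector<4x1xi32>, %acc: vector<1x1xi32>) -> vector<1x1xi32> {
  // One of the 16 unrolled pieces: a single output element accumulated over K = 4.
  %r = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %row, %col, %acc : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
  return %r : vector<1x1xi32>
}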
--- After unrolling vector ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst = arith.constant dense<0> : vector<4x4xi32>
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.extract_strided_slice %cst {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%13 = vector.transfer_write %12, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%14 = vector.extract_strided_slice %cst {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%15 = vector.transfer_write %14, %13[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%16 = vector.extract_strided_slice %cst {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%17 = vector.transfer_write %16, %15[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%18 = vector.extract_strided_slice %cst {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%19 = vector.transfer_write %18, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%20 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %19) -> (tensor<4x4xi32>) {
%21 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%22 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%24 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%25 = vector.transfer_read %8[%24, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%26 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%27 = vector.transfer_read %8[%26, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%35 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%36 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%37 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%38 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%39 = arith.extsi %21 : vector<1x4xi8> to vector<1x4xi32>
%40 = arith.extsi %23 : vector<1x4xi8> to vector<1x4xi32>
%41 = arith.extsi %25 : vector<1x4xi8> to vector<1x4xi32>
%42 = arith.extsi %27 : vector<1x4xi8> to vector<1x4xi32>
%43 = arith.extsi %28 : vector<1x4xi8> to vector<1x4xi32>
%44 = vector.insert_strided_slice %43, %cst {offsets = [0, 0], strides = [1, 1]} : vector<1x4xi32> into vector<4x4xi32>
%45 = arith.extsi %30 : vector<1x4xi8> to vector<1x4xi32>
%46 = vector.insert_strided_slice %45, %44 {offsets = [1, 0], strides = [1, 1]} : vector<1x4xi32> into vector<4x4xi32>
%47 = arith.extsi %32 : vector<1x4xi8> to vector<1x4xi32>
%48 = vector.insert_strided_slice %47, %46 {offsets = [2, 0], strides = [1, 1]} : vector<1x4xi32> into vector<4x4xi32>
%49 = arith.extsi %34 : vector<1x4xi8> to vector<1x4xi32>
%50 = vector.insert_strided_slice %49, %48 {offsets = [3, 0], strides = [1, 1]} : vector<1x4xi32> into vector<4x4xi32>
%51 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%52 = vector.extract_strided_slice %35 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%53 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %51, %52 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%54 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%55 = vector.extract_strided_slice %35 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %54, %55 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%57 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%58 = vector.extract_strided_slice %35 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %57, %58 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%60 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%61 = vector.extract_strided_slice %35 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %60, %61 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%63 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%64 = vector.extract_strided_slice %36 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %63, %64 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%66 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%67 = vector.extract_strided_slice %36 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %66, %67 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%69 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%70 = vector.extract_strided_slice %36 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%71 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %69, %70 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%72 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%73 = vector.extract_strided_slice %36 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%74 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %72, %73 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%75 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%76 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%77 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %75, %76 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%78 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%79 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%80 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %78, %79 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%81 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%82 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%83 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %81, %82 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%84 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%85 = vector.extract_strided_slice %37 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%86 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %84, %85 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%87 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%88 = vector.extract_strided_slice %38 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%89 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %87, %88 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%90 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%91 = vector.extract_strided_slice %38 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%92 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %90, %91 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%93 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%94 = vector.extract_strided_slice %38 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%95 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %93, %94 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%96 = vector.extract_strided_slice %50 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
%97 = vector.extract_strided_slice %38 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%98 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %96, %97 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%99 = vector.insert_strided_slice %53, %cst {offsets = [0, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%100 = vector.insert_strided_slice %56, %99 {offsets = [0, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%101 = vector.insert_strided_slice %59, %100 {offsets = [0, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%102 = vector.insert_strided_slice %62, %101 {offsets = [0, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%103 = vector.insert_strided_slice %65, %102 {offsets = [1, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%104 = vector.insert_strided_slice %68, %103 {offsets = [1, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%105 = vector.insert_strided_slice %71, %104 {offsets = [1, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%106 = vector.insert_strided_slice %74, %105 {offsets = [1, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%107 = vector.insert_strided_slice %77, %106 {offsets = [2, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%108 = vector.insert_strided_slice %80, %107 {offsets = [2, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%109 = vector.insert_strided_slice %83, %108 {offsets = [2, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%110 = vector.insert_strided_slice %86, %109 {offsets = [2, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%111 = vector.insert_strided_slice %89, %110 {offsets = [3, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%112 = vector.insert_strided_slice %92, %111 {offsets = [3, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%113 = vector.insert_strided_slice %95, %112 {offsets = [3, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%114 = vector.insert_strided_slice %98, %113 {offsets = [3, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%115 = vector.extract_strided_slice %114 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%116 = vector.transfer_write %115, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%117 = vector.extract_strided_slice %114 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%118 = vector.transfer_write %117, %116[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%119 = vector.extract_strided_slice %114 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%120 = vector.transfer_write %119, %118[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%121 = vector.extract_strided_slice %114 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%122 = vector.transfer_write %121, %120[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
scf.yield %122 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %20 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
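One inefficiency stands out in the unrolled IR above: the RHS rows are sign-extended to i32 one 1x4 piece at a time, reassembled into a 4x4xi32 vector, and then re-sliced into 4x1xi32 columns for the contracts. The "Jakub cleanup extract/insert" stage below reorders this so the insert/extract shuffling happens on the narrow i8 vectors and the extsi applies directly to each extracted 4x1 column. The core of the rewrite, as a standalone sketch (illustrative, not the pass's literal pattern):

func.func @sink_extsi_below_extract(%v: vector<4x4xi8>) -> vector<4x1xi32> {
  // Before: extend the whole tile to i32 first, then slice the wide vector.
  //   %w = arith.extsi %v : vector<4x4xi8> to vector<4x4xi32>
  //   %s = vector.extract_strided_slice %w {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi32> to vector<4x1xi32>
  // After: slice the i8 vector, then extend only the 4x1 column that is actually used.
  %s = vector.extract_strided_slice %v {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
  %w = arith.extsi %s : vector<4x1xi8> to vector<4x1xi32>
  return %w : vector<4x1xi32>
}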
--- After Jakub cleanup extract/insert ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x4xi8>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_0 = arith.constant dense<0> : vector<4x4xi32>
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.extract_strided_slice %cst_0 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%13 = vector.transfer_write %12, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%14 = vector.extract_strided_slice %cst_0 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%15 = vector.transfer_write %14, %13[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%16 = vector.extract_strided_slice %cst_0 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%17 = vector.transfer_write %16, %15[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%18 = vector.extract_strided_slice %cst_0 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%19 = vector.transfer_write %18, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%20 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %19) -> (tensor<4x4xi32>) {
%21 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%22 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%24 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%25 = vector.transfer_read %8[%24, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%26 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%27 = vector.transfer_read %8[%26, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%35 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%36 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%37 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%38 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%39 = arith.extsi %21 : vector<1x4xi8> to vector<1x4xi32>
%40 = arith.extsi %23 : vector<1x4xi8> to vector<1x4xi32>
%41 = arith.extsi %25 : vector<1x4xi8> to vector<1x4xi32>
%42 = arith.extsi %27 : vector<1x4xi8> to vector<1x4xi32>
%43 = vector.insert_strided_slice %28, %cst {offsets = [0, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%44 = vector.insert_strided_slice %30, %43 {offsets = [1, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%45 = vector.insert_strided_slice %32, %44 {offsets = [2, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%46 = vector.insert_strided_slice %34, %45 {offsets = [3, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%47 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%48 = arith.extsi %47 : vector<4x1xi8> to vector<4x1xi32>
%49 = vector.extract_strided_slice %35 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %48, %49 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%51 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%52 = arith.extsi %51 : vector<4x1xi8> to vector<4x1xi32>
%53 = vector.extract_strided_slice %35 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %52, %53 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%55 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%56 = arith.extsi %55 : vector<4x1xi8> to vector<4x1xi32>
%57 = vector.extract_strided_slice %35 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%58 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %56, %57 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%59 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%60 = arith.extsi %59 : vector<4x1xi8> to vector<4x1xi32>
%61 = vector.extract_strided_slice %35 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %60, %61 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%63 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract_strided_slice %36 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %64, %65 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%67 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32>
%69 = vector.extract_strided_slice %36 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%70 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %68, %69 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%71 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%72 = arith.extsi %71 : vector<4x1xi8> to vector<4x1xi32>
%73 = vector.extract_strided_slice %36 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%74 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %72, %73 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%75 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%76 = arith.extsi %75 : vector<4x1xi8> to vector<4x1xi32>
%77 = vector.extract_strided_slice %36 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %76, %77 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%79 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%80 = arith.extsi %79 : vector<4x1xi8> to vector<4x1xi32>
%81 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%82 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %80, %81 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%83 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%84 = arith.extsi %83 : vector<4x1xi8> to vector<4x1xi32>
%85 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%86 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %84, %85 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%87 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%88 = arith.extsi %87 : vector<4x1xi8> to vector<4x1xi32>
%89 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%90 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %88, %89 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%91 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%92 = arith.extsi %91 : vector<4x1xi8> to vector<4x1xi32>
%93 = vector.extract_strided_slice %37 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%94 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %92, %93 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%95 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%96 = arith.extsi %95 : vector<4x1xi8> to vector<4x1xi32>
%97 = vector.extract_strided_slice %38 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%98 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %96, %97 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%99 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%100 = arith.extsi %99 : vector<4x1xi8> to vector<4x1xi32>
%101 = vector.extract_strided_slice %38 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%102 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %100, %101 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%103 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%104 = arith.extsi %103 : vector<4x1xi8> to vector<4x1xi32>
%105 = vector.extract_strided_slice %38 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%106 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %104, %105 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%107 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%108 = arith.extsi %107 : vector<4x1xi8> to vector<4x1xi32>
%109 = vector.extract_strided_slice %38 {offsets = [0, 3], sizes = [1, 1], strides = [1, 1]} : vector<1x4xi32> to vector<1x1xi32>
%110 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %108, %109 : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
%111 = vector.insert_strided_slice %50, %cst_0 {offsets = [0, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%112 = vector.insert_strided_slice %54, %111 {offsets = [0, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%113 = vector.insert_strided_slice %58, %112 {offsets = [0, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%114 = vector.insert_strided_slice %62, %113 {offsets = [0, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%115 = vector.insert_strided_slice %66, %114 {offsets = [1, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%116 = vector.insert_strided_slice %70, %115 {offsets = [1, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%117 = vector.insert_strided_slice %74, %116 {offsets = [1, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%118 = vector.insert_strided_slice %78, %117 {offsets = [1, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%119 = vector.insert_strided_slice %82, %118 {offsets = [2, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%120 = vector.insert_strided_slice %86, %119 {offsets = [2, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%121 = vector.insert_strided_slice %90, %120 {offsets = [2, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%122 = vector.insert_strided_slice %94, %121 {offsets = [2, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%123 = vector.insert_strided_slice %98, %122 {offsets = [3, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%124 = vector.insert_strided_slice %102, %123 {offsets = [3, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%125 = vector.insert_strided_slice %106, %124 {offsets = [3, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%126 = vector.insert_strided_slice %110, %125 {offsets = [3, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%127 = vector.extract_strided_slice %126 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%128 = vector.transfer_write %127, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%129 = vector.extract_strided_slice %126 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%130 = vector.transfer_write %129, %128[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%131 = vector.extract_strided_slice %126 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%132 = vector.transfer_write %131, %130[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%133 = vector.extract_strided_slice %126 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%134 = vector.transfer_write %133, %132[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
scf.yield %134 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %20 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
--- After lowering size-1 reduction contract ops ---
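// The dump below differs from the previous one in how each vector.contract with a
// vector<1x1xi32> result is expressed: the contraction is unrolled into an elementwise
// multiply followed by a horizontal add-reduction, with the result repacked into the
// 1x1 vector shape. A representative before/after pair, adapted from this IR with the
// SSA value names shortened purely for illustration:
//
//   // before: size-1 reduction contract
//   %acc = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
//                                            affine_map<(d0, d1, d2) -> (d2, d1)>,
//                                            affine_map<(d0, d1, d2) -> (d0, d1)>],
//                           iterator_types = ["parallel", "parallel", "reduction"],
//                           kind = #vector.kind<add>}
//            %row, %col, %accIn : vector<1x4xi32>, vector<4x1xi32> into vector<1x1xi32>
//
//   // after: the 4-element column is repacked into a vector<4xi32>, multiplied
//   // elementwise with the row, and folded into the scalar accumulator
//   %prod = arith.muli %row4, %col4 : vector<4xi32>
//   %sum  = vector.reduction <add>, %prod, %accScalar : vector<4xi32> into i32
//   %res0 = vector.insert %sum, %cst_0 [0] : i32 into vector<1xi32>
//   %res  = vector.insert %res0, %cst [0] : vector<1xi32> into vector<1x1xi32>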
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<1x1xi32>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%cst_2 = arith.constant dense<0> : vector<4x4xi8>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_3 = arith.constant dense<0> : vector<4x4xi32>
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.extract_strided_slice %cst_3 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%13 = vector.transfer_write %12, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%14 = vector.extract_strided_slice %cst_3 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%15 = vector.transfer_write %14, %13[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%16 = vector.extract_strided_slice %cst_3 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%17 = vector.transfer_write %16, %15[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%18 = vector.extract_strided_slice %cst_3 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%19 = vector.transfer_write %18, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%20 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %19) -> (tensor<4x4xi32>) {
%21 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%22 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%24 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%25 = vector.transfer_read %8[%24, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%26 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%27 = vector.transfer_read %8[%26, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%35 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%36 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%37 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%38 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%39 = arith.extsi %21 : vector<1x4xi8> to vector<1x4xi32>
%40 = arith.extsi %23 : vector<1x4xi8> to vector<1x4xi32>
%41 = arith.extsi %25 : vector<1x4xi8> to vector<1x4xi32>
%42 = arith.extsi %27 : vector<1x4xi8> to vector<1x4xi32>
%43 = vector.insert_strided_slice %28, %cst_2 {offsets = [0, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%44 = vector.insert_strided_slice %30, %43 {offsets = [1, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%45 = vector.insert_strided_slice %32, %44 {offsets = [2, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%46 = vector.insert_strided_slice %34, %45 {offsets = [3, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%47 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%48 = arith.extsi %47 : vector<4x1xi8> to vector<4x1xi32>
%49 = vector.extract %39[0] : vector<1x4xi32>
%50 = vector.extract %48[0, 0] : vector<4x1xi32>
%51 = vector.insert %50, %cst_1 [0] : i32 into vector<4xi32>
%52 = vector.extract %48[1, 0] : vector<4x1xi32>
%53 = vector.insert %52, %51 [1] : i32 into vector<4xi32>
%54 = vector.extract %48[2, 0] : vector<4x1xi32>
%55 = vector.insert %54, %53 [2] : i32 into vector<4xi32>
%56 = vector.extract %48[3, 0] : vector<4x1xi32>
%57 = vector.insert %56, %55 [3] : i32 into vector<4xi32>
%58 = vector.extract %35[0, 0] : vector<1x4xi32>
%59 = arith.muli %49, %57 : vector<4xi32>
%60 = vector.reduction <add>, %59, %58 : vector<4xi32> into i32
%61 = vector.insert %60, %cst_0 [0] : i32 into vector<1xi32>
%62 = vector.insert %61, %cst [0] : vector<1xi32> into vector<1x1xi32>
%63 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %39[0] : vector<1x4xi32>
%66 = vector.extract %64[0, 0] : vector<4x1xi32>
%67 = vector.insert %66, %cst_1 [0] : i32 into vector<4xi32>
%68 = vector.extract %64[1, 0] : vector<4x1xi32>
%69 = vector.insert %68, %67 [1] : i32 into vector<4xi32>
%70 = vector.extract %64[2, 0] : vector<4x1xi32>
%71 = vector.insert %70, %69 [2] : i32 into vector<4xi32>
%72 = vector.extract %64[3, 0] : vector<4x1xi32>
%73 = vector.insert %72, %71 [3] : i32 into vector<4xi32>
%74 = vector.extract %35[0, 1] : vector<1x4xi32>
%75 = arith.muli %65, %73 : vector<4xi32>
%76 = vector.reduction <add>, %75, %74 : vector<4xi32> into i32
%77 = vector.insert %76, %cst_0 [0] : i32 into vector<1xi32>
%78 = vector.insert %77, %cst [0] : vector<1xi32> into vector<1x1xi32>
%79 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%80 = arith.extsi %79 : vector<4x1xi8> to vector<4x1xi32>
%81 = vector.extract %39[0] : vector<1x4xi32>
%82 = vector.extract %80[0, 0] : vector<4x1xi32>
%83 = vector.insert %82, %cst_1 [0] : i32 into vector<4xi32>
%84 = vector.extract %80[1, 0] : vector<4x1xi32>
%85 = vector.insert %84, %83 [1] : i32 into vector<4xi32>
%86 = vector.extract %80[2, 0] : vector<4x1xi32>
%87 = vector.insert %86, %85 [2] : i32 into vector<4xi32>
%88 = vector.extract %80[3, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [3] : i32 into vector<4xi32>
%90 = vector.extract %35[0, 2] : vector<1x4xi32>
%91 = arith.muli %81, %89 : vector<4xi32>
%92 = vector.reduction <add>, %91, %90 : vector<4xi32> into i32
%93 = vector.insert %92, %cst_0 [0] : i32 into vector<1xi32>
%94 = vector.insert %93, %cst [0] : vector<1xi32> into vector<1x1xi32>
%95 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%96 = arith.extsi %95 : vector<4x1xi8> to vector<4x1xi32>
%97 = vector.extract %39[0] : vector<1x4xi32>
%98 = vector.extract %96[0, 0] : vector<4x1xi32>
%99 = vector.insert %98, %cst_1 [0] : i32 into vector<4xi32>
%100 = vector.extract %96[1, 0] : vector<4x1xi32>
%101 = vector.insert %100, %99 [1] : i32 into vector<4xi32>
%102 = vector.extract %96[2, 0] : vector<4x1xi32>
%103 = vector.insert %102, %101 [2] : i32 into vector<4xi32>
%104 = vector.extract %96[3, 0] : vector<4x1xi32>
%105 = vector.insert %104, %103 [3] : i32 into vector<4xi32>
%106 = vector.extract %35[0, 3] : vector<1x4xi32>
%107 = arith.muli %97, %105 : vector<4xi32>
%108 = vector.reduction <add>, %107, %106 : vector<4xi32> into i32
%109 = vector.insert %108, %cst_0 [0] : i32 into vector<1xi32>
%110 = vector.insert %109, %cst [0] : vector<1xi32> into vector<1x1xi32>
%111 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%112 = arith.extsi %111 : vector<4x1xi8> to vector<4x1xi32>
%113 = vector.extract %40[0] : vector<1x4xi32>
%114 = vector.extract %112[0, 0] : vector<4x1xi32>
%115 = vector.insert %114, %cst_1 [0] : i32 into vector<4xi32>
%116 = vector.extract %112[1, 0] : vector<4x1xi32>
%117 = vector.insert %116, %115 [1] : i32 into vector<4xi32>
%118 = vector.extract %112[2, 0] : vector<4x1xi32>
%119 = vector.insert %118, %117 [2] : i32 into vector<4xi32>
%120 = vector.extract %112[3, 0] : vector<4x1xi32>
%121 = vector.insert %120, %119 [3] : i32 into vector<4xi32>
%122 = vector.extract %36[0, 0] : vector<1x4xi32>
%123 = arith.muli %113, %121 : vector<4xi32>
%124 = vector.reduction <add>, %123, %122 : vector<4xi32> into i32
%125 = vector.insert %124, %cst_0 [0] : i32 into vector<1xi32>
%126 = vector.insert %125, %cst [0] : vector<1xi32> into vector<1x1xi32>
%127 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%128 = arith.extsi %127 : vector<4x1xi8> to vector<4x1xi32>
%129 = vector.extract %40[0] : vector<1x4xi32>
%130 = vector.extract %128[0, 0] : vector<4x1xi32>
%131 = vector.insert %130, %cst_1 [0] : i32 into vector<4xi32>
%132 = vector.extract %128[1, 0] : vector<4x1xi32>
%133 = vector.insert %132, %131 [1] : i32 into vector<4xi32>
%134 = vector.extract %128[2, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [2] : i32 into vector<4xi32>
%136 = vector.extract %128[3, 0] : vector<4x1xi32>
%137 = vector.insert %136, %135 [3] : i32 into vector<4xi32>
%138 = vector.extract %36[0, 1] : vector<1x4xi32>
%139 = arith.muli %129, %137 : vector<4xi32>
%140 = vector.reduction <add>, %139, %138 : vector<4xi32> into i32
%141 = vector.insert %140, %cst_0 [0] : i32 into vector<1xi32>
%142 = vector.insert %141, %cst [0] : vector<1xi32> into vector<1x1xi32>
%143 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%144 = arith.extsi %143 : vector<4x1xi8> to vector<4x1xi32>
%145 = vector.extract %40[0] : vector<1x4xi32>
%146 = vector.extract %144[0, 0] : vector<4x1xi32>
%147 = vector.insert %146, %cst_1 [0] : i32 into vector<4xi32>
%148 = vector.extract %144[1, 0] : vector<4x1xi32>
%149 = vector.insert %148, %147 [1] : i32 into vector<4xi32>
%150 = vector.extract %144[2, 0] : vector<4x1xi32>
%151 = vector.insert %150, %149 [2] : i32 into vector<4xi32>
%152 = vector.extract %144[3, 0] : vector<4x1xi32>
%153 = vector.insert %152, %151 [3] : i32 into vector<4xi32>
%154 = vector.extract %36[0, 2] : vector<1x4xi32>
%155 = arith.muli %145, %153 : vector<4xi32>
%156 = vector.reduction <add>, %155, %154 : vector<4xi32> into i32
%157 = vector.insert %156, %cst_0 [0] : i32 into vector<1xi32>
%158 = vector.insert %157, %cst [0] : vector<1xi32> into vector<1x1xi32>
%159 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%160 = arith.extsi %159 : vector<4x1xi8> to vector<4x1xi32>
%161 = vector.extract %40[0] : vector<1x4xi32>
%162 = vector.extract %160[0, 0] : vector<4x1xi32>
%163 = vector.insert %162, %cst_1 [0] : i32 into vector<4xi32>
%164 = vector.extract %160[1, 0] : vector<4x1xi32>
%165 = vector.insert %164, %163 [1] : i32 into vector<4xi32>
%166 = vector.extract %160[2, 0] : vector<4x1xi32>
%167 = vector.insert %166, %165 [2] : i32 into vector<4xi32>
%168 = vector.extract %160[3, 0] : vector<4x1xi32>
%169 = vector.insert %168, %167 [3] : i32 into vector<4xi32>
%170 = vector.extract %36[0, 3] : vector<1x4xi32>
%171 = arith.muli %161, %169 : vector<4xi32>
%172 = vector.reduction <add>, %171, %170 : vector<4xi32> into i32
%173 = vector.insert %172, %cst_0 [0] : i32 into vector<1xi32>
%174 = vector.insert %173, %cst [0] : vector<1xi32> into vector<1x1xi32>
%175 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%176 = arith.extsi %175 : vector<4x1xi8> to vector<4x1xi32>
%177 = vector.extract %41[0] : vector<1x4xi32>
%178 = vector.extract %176[0, 0] : vector<4x1xi32>
%179 = vector.insert %178, %cst_1 [0] : i32 into vector<4xi32>
%180 = vector.extract %176[1, 0] : vector<4x1xi32>
%181 = vector.insert %180, %179 [1] : i32 into vector<4xi32>
%182 = vector.extract %176[2, 0] : vector<4x1xi32>
%183 = vector.insert %182, %181 [2] : i32 into vector<4xi32>
%184 = vector.extract %176[3, 0] : vector<4x1xi32>
%185 = vector.insert %184, %183 [3] : i32 into vector<4xi32>
%186 = vector.extract %37[0, 0] : vector<1x4xi32>
%187 = arith.muli %177, %185 : vector<4xi32>
%188 = vector.reduction <add>, %187, %186 : vector<4xi32> into i32
%189 = vector.insert %188, %cst_0 [0] : i32 into vector<1xi32>
%190 = vector.insert %189, %cst [0] : vector<1xi32> into vector<1x1xi32>
%191 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%192 = arith.extsi %191 : vector<4x1xi8> to vector<4x1xi32>
%193 = vector.extract %41[0] : vector<1x4xi32>
%194 = vector.extract %192[0, 0] : vector<4x1xi32>
%195 = vector.insert %194, %cst_1 [0] : i32 into vector<4xi32>
%196 = vector.extract %192[1, 0] : vector<4x1xi32>
%197 = vector.insert %196, %195 [1] : i32 into vector<4xi32>
%198 = vector.extract %192[2, 0] : vector<4x1xi32>
%199 = vector.insert %198, %197 [2] : i32 into vector<4xi32>
%200 = vector.extract %192[3, 0] : vector<4x1xi32>
%201 = vector.insert %200, %199 [3] : i32 into vector<4xi32>
%202 = vector.extract %37[0, 1] : vector<1x4xi32>
%203 = arith.muli %193, %201 : vector<4xi32>
%204 = vector.reduction <add>, %203, %202 : vector<4xi32> into i32
%205 = vector.insert %204, %cst_0 [0] : i32 into vector<1xi32>
%206 = vector.insert %205, %cst [0] : vector<1xi32> into vector<1x1xi32>
%207 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%208 = arith.extsi %207 : vector<4x1xi8> to vector<4x1xi32>
%209 = vector.extract %41[0] : vector<1x4xi32>
%210 = vector.extract %208[0, 0] : vector<4x1xi32>
%211 = vector.insert %210, %cst_1 [0] : i32 into vector<4xi32>
%212 = vector.extract %208[1, 0] : vector<4x1xi32>
%213 = vector.insert %212, %211 [1] : i32 into vector<4xi32>
%214 = vector.extract %208[2, 0] : vector<4x1xi32>
%215 = vector.insert %214, %213 [2] : i32 into vector<4xi32>
%216 = vector.extract %208[3, 0] : vector<4x1xi32>
%217 = vector.insert %216, %215 [3] : i32 into vector<4xi32>
%218 = vector.extract %37[0, 2] : vector<1x4xi32>
%219 = arith.muli %209, %217 : vector<4xi32>
%220 = vector.reduction <add>, %219, %218 : vector<4xi32> into i32
%221 = vector.insert %220, %cst_0 [0] : i32 into vector<1xi32>
%222 = vector.insert %221, %cst [0] : vector<1xi32> into vector<1x1xi32>
%223 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%224 = arith.extsi %223 : vector<4x1xi8> to vector<4x1xi32>
%225 = vector.extract %41[0] : vector<1x4xi32>
%226 = vector.extract %224[0, 0] : vector<4x1xi32>
%227 = vector.insert %226, %cst_1 [0] : i32 into vector<4xi32>
%228 = vector.extract %224[1, 0] : vector<4x1xi32>
%229 = vector.insert %228, %227 [1] : i32 into vector<4xi32>
%230 = vector.extract %224[2, 0] : vector<4x1xi32>
%231 = vector.insert %230, %229 [2] : i32 into vector<4xi32>
%232 = vector.extract %224[3, 0] : vector<4x1xi32>
%233 = vector.insert %232, %231 [3] : i32 into vector<4xi32>
%234 = vector.extract %37[0, 3] : vector<1x4xi32>
%235 = arith.muli %225, %233 : vector<4xi32>
%236 = vector.reduction <add>, %235, %234 : vector<4xi32> into i32
%237 = vector.insert %236, %cst_0 [0] : i32 into vector<1xi32>
%238 = vector.insert %237, %cst [0] : vector<1xi32> into vector<1x1xi32>
%239 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%240 = arith.extsi %239 : vector<4x1xi8> to vector<4x1xi32>
%241 = vector.extract %42[0] : vector<1x4xi32>
%242 = vector.extract %240[0, 0] : vector<4x1xi32>
%243 = vector.insert %242, %cst_1 [0] : i32 into vector<4xi32>
%244 = vector.extract %240[1, 0] : vector<4x1xi32>
%245 = vector.insert %244, %243 [1] : i32 into vector<4xi32>
%246 = vector.extract %240[2, 0] : vector<4x1xi32>
%247 = vector.insert %246, %245 [2] : i32 into vector<4xi32>
%248 = vector.extract %240[3, 0] : vector<4x1xi32>
%249 = vector.insert %248, %247 [3] : i32 into vector<4xi32>
%250 = vector.extract %38[0, 0] : vector<1x4xi32>
%251 = arith.muli %241, %249 : vector<4xi32>
%252 = vector.reduction <add>, %251, %250 : vector<4xi32> into i32
%253 = vector.insert %252, %cst_0 [0] : i32 into vector<1xi32>
%254 = vector.insert %253, %cst [0] : vector<1xi32> into vector<1x1xi32>
%255 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%256 = arith.extsi %255 : vector<4x1xi8> to vector<4x1xi32>
%257 = vector.extract %42[0] : vector<1x4xi32>
%258 = vector.extract %256[0, 0] : vector<4x1xi32>
%259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32>
%260 = vector.extract %256[1, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [1] : i32 into vector<4xi32>
%262 = vector.extract %256[2, 0] : vector<4x1xi32>
%263 = vector.insert %262, %261 [2] : i32 into vector<4xi32>
%264 = vector.extract %256[3, 0] : vector<4x1xi32>
%265 = vector.insert %264, %263 [3] : i32 into vector<4xi32>
%266 = vector.extract %38[0, 1] : vector<1x4xi32>
%267 = arith.muli %257, %265 : vector<4xi32>
%268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32
%269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32>
%270 = vector.insert %269, %cst [0] : vector<1xi32> into vector<1x1xi32>
%271 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%272 = arith.extsi %271 : vector<4x1xi8> to vector<4x1xi32>
%273 = vector.extract %42[0] : vector<1x4xi32>
%274 = vector.extract %272[0, 0] : vector<4x1xi32>
%275 = vector.insert %274, %cst_1 [0] : i32 into vector<4xi32>
%276 = vector.extract %272[1, 0] : vector<4x1xi32>
%277 = vector.insert %276, %275 [1] : i32 into vector<4xi32>
%278 = vector.extract %272[2, 0] : vector<4x1xi32>
%279 = vector.insert %278, %277 [2] : i32 into vector<4xi32>
%280 = vector.extract %272[3, 0] : vector<4x1xi32>
%281 = vector.insert %280, %279 [3] : i32 into vector<4xi32>
%282 = vector.extract %38[0, 2] : vector<1x4xi32>
%283 = arith.muli %273, %281 : vector<4xi32>
%284 = vector.reduction <add>, %283, %282 : vector<4xi32> into i32
%285 = vector.insert %284, %cst_0 [0] : i32 into vector<1xi32>
%286 = vector.insert %285, %cst [0] : vector<1xi32> into vector<1x1xi32>
%287 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%288 = arith.extsi %287 : vector<4x1xi8> to vector<4x1xi32>
%289 = vector.extract %42[0] : vector<1x4xi32>
%290 = vector.extract %288[0, 0] : vector<4x1xi32>
%291 = vector.insert %290, %cst_1 [0] : i32 into vector<4xi32>
%292 = vector.extract %288[1, 0] : vector<4x1xi32>
%293 = vector.insert %292, %291 [1] : i32 into vector<4xi32>
%294 = vector.extract %288[2, 0] : vector<4x1xi32>
%295 = vector.insert %294, %293 [2] : i32 into vector<4xi32>
%296 = vector.extract %288[3, 0] : vector<4x1xi32>
%297 = vector.insert %296, %295 [3] : i32 into vector<4xi32>
%298 = vector.extract %38[0, 3] : vector<1x4xi32>
%299 = arith.muli %289, %297 : vector<4xi32>
%300 = vector.reduction <add>, %299, %298 : vector<4xi32> into i32
%301 = vector.insert %300, %cst_0 [0] : i32 into vector<1xi32>
%302 = vector.insert %301, %cst [0] : vector<1xi32> into vector<1x1xi32>
%303 = vector.insert_strided_slice %62, %cst_3 {offsets = [0, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%304 = vector.insert_strided_slice %78, %303 {offsets = [0, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%305 = vector.insert_strided_slice %94, %304 {offsets = [0, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%306 = vector.insert_strided_slice %110, %305 {offsets = [0, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%307 = vector.insert_strided_slice %126, %306 {offsets = [1, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%308 = vector.insert_strided_slice %142, %307 {offsets = [1, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%309 = vector.insert_strided_slice %158, %308 {offsets = [1, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%310 = vector.insert_strided_slice %174, %309 {offsets = [1, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%311 = vector.insert_strided_slice %190, %310 {offsets = [2, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%312 = vector.insert_strided_slice %206, %311 {offsets = [2, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%313 = vector.insert_strided_slice %222, %312 {offsets = [2, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%314 = vector.insert_strided_slice %238, %313 {offsets = [2, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%315 = vector.insert_strided_slice %254, %314 {offsets = [3, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%316 = vector.insert_strided_slice %270, %315 {offsets = [3, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%317 = vector.insert_strided_slice %286, %316 {offsets = [3, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%318 = vector.insert_strided_slice %302, %317 {offsets = [3, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%319 = vector.extract_strided_slice %318 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%320 = vector.transfer_write %319, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%321 = vector.extract_strided_slice %318 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%322 = vector.transfer_write %321, %320[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%323 = vector.extract_strided_slice %318 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%324 = vector.transfer_write %323, %322[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%325 = vector.extract_strided_slice %318 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%326 = vector.transfer_write %325, %324[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
scf.yield %326 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %20 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
--- After lowering transpose ops ---
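// This dump appears unchanged relative to the previous step: the function contains no
// vector.transpose ops, so the transpose-lowering patterns have nothing to rewrite here.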
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<1x1xi32>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%cst_2 = arith.constant dense<0> : vector<4x4xi8>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst_3 = arith.constant dense<0> : vector<4x4xi32>
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.extract_strided_slice %cst_3 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%13 = vector.transfer_write %12, %extracted_slice[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%14 = vector.extract_strided_slice %cst_3 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%15 = vector.transfer_write %14, %13[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%16 = vector.extract_strided_slice %cst_3 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%17 = vector.transfer_write %16, %15[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%18 = vector.extract_strided_slice %cst_3 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%19 = vector.transfer_write %18, %17[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%20 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %19) -> (tensor<4x4xi32>) {
%21 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%22 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%24 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%25 = vector.transfer_read %8[%24, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%26 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%27 = vector.transfer_read %8[%26, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
%28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true, true]} : tensor<1024x32xi8>, vector<1x4xi8>
%35 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%36 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%37 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%38 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x4xi32>, vector<1x4xi32>
%39 = arith.extsi %21 : vector<1x4xi8> to vector<1x4xi32>
%40 = arith.extsi %23 : vector<1x4xi8> to vector<1x4xi32>
%41 = arith.extsi %25 : vector<1x4xi8> to vector<1x4xi32>
%42 = arith.extsi %27 : vector<1x4xi8> to vector<1x4xi32>
%43 = vector.insert_strided_slice %28, %cst_2 {offsets = [0, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%44 = vector.insert_strided_slice %30, %43 {offsets = [1, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%45 = vector.insert_strided_slice %32, %44 {offsets = [2, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%46 = vector.insert_strided_slice %34, %45 {offsets = [3, 0], strides = [1, 1]} : vector<1x4xi8> into vector<4x4xi8>
%47 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%48 = arith.extsi %47 : vector<4x1xi8> to vector<4x1xi32>
%49 = vector.extract %39[0] : vector<1x4xi32>
%50 = vector.extract %48[0, 0] : vector<4x1xi32>
%51 = vector.insert %50, %cst_1 [0] : i32 into vector<4xi32>
%52 = vector.extract %48[1, 0] : vector<4x1xi32>
%53 = vector.insert %52, %51 [1] : i32 into vector<4xi32>
%54 = vector.extract %48[2, 0] : vector<4x1xi32>
%55 = vector.insert %54, %53 [2] : i32 into vector<4xi32>
%56 = vector.extract %48[3, 0] : vector<4x1xi32>
%57 = vector.insert %56, %55 [3] : i32 into vector<4xi32>
%58 = vector.extract %35[0, 0] : vector<1x4xi32>
%59 = arith.muli %49, %57 : vector<4xi32>
%60 = vector.reduction <add>, %59, %58 : vector<4xi32> into i32
%61 = vector.insert %60, %cst_0 [0] : i32 into vector<1xi32>
%62 = vector.insert %61, %cst [0] : vector<1xi32> into vector<1x1xi32>
%63 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %39[0] : vector<1x4xi32>
%66 = vector.extract %64[0, 0] : vector<4x1xi32>
%67 = vector.insert %66, %cst_1 [0] : i32 into vector<4xi32>
%68 = vector.extract %64[1, 0] : vector<4x1xi32>
%69 = vector.insert %68, %67 [1] : i32 into vector<4xi32>
%70 = vector.extract %64[2, 0] : vector<4x1xi32>
%71 = vector.insert %70, %69 [2] : i32 into vector<4xi32>
%72 = vector.extract %64[3, 0] : vector<4x1xi32>
%73 = vector.insert %72, %71 [3] : i32 into vector<4xi32>
%74 = vector.extract %35[0, 1] : vector<1x4xi32>
%75 = arith.muli %65, %73 : vector<4xi32>
%76 = vector.reduction <add>, %75, %74 : vector<4xi32> into i32
%77 = vector.insert %76, %cst_0 [0] : i32 into vector<1xi32>
%78 = vector.insert %77, %cst [0] : vector<1xi32> into vector<1x1xi32>
%79 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%80 = arith.extsi %79 : vector<4x1xi8> to vector<4x1xi32>
%81 = vector.extract %39[0] : vector<1x4xi32>
%82 = vector.extract %80[0, 0] : vector<4x1xi32>
%83 = vector.insert %82, %cst_1 [0] : i32 into vector<4xi32>
%84 = vector.extract %80[1, 0] : vector<4x1xi32>
%85 = vector.insert %84, %83 [1] : i32 into vector<4xi32>
%86 = vector.extract %80[2, 0] : vector<4x1xi32>
%87 = vector.insert %86, %85 [2] : i32 into vector<4xi32>
%88 = vector.extract %80[3, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [3] : i32 into vector<4xi32>
%90 = vector.extract %35[0, 2] : vector<1x4xi32>
%91 = arith.muli %81, %89 : vector<4xi32>
%92 = vector.reduction <add>, %91, %90 : vector<4xi32> into i32
%93 = vector.insert %92, %cst_0 [0] : i32 into vector<1xi32>
%94 = vector.insert %93, %cst [0] : vector<1xi32> into vector<1x1xi32>
%95 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%96 = arith.extsi %95 : vector<4x1xi8> to vector<4x1xi32>
%97 = vector.extract %39[0] : vector<1x4xi32>
%98 = vector.extract %96[0, 0] : vector<4x1xi32>
%99 = vector.insert %98, %cst_1 [0] : i32 into vector<4xi32>
%100 = vector.extract %96[1, 0] : vector<4x1xi32>
%101 = vector.insert %100, %99 [1] : i32 into vector<4xi32>
%102 = vector.extract %96[2, 0] : vector<4x1xi32>
%103 = vector.insert %102, %101 [2] : i32 into vector<4xi32>
%104 = vector.extract %96[3, 0] : vector<4x1xi32>
%105 = vector.insert %104, %103 [3] : i32 into vector<4xi32>
%106 = vector.extract %35[0, 3] : vector<1x4xi32>
%107 = arith.muli %97, %105 : vector<4xi32>
%108 = vector.reduction <add>, %107, %106 : vector<4xi32> into i32
%109 = vector.insert %108, %cst_0 [0] : i32 into vector<1xi32>
%110 = vector.insert %109, %cst [0] : vector<1xi32> into vector<1x1xi32>
%111 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%112 = arith.extsi %111 : vector<4x1xi8> to vector<4x1xi32>
%113 = vector.extract %40[0] : vector<1x4xi32>
%114 = vector.extract %112[0, 0] : vector<4x1xi32>
%115 = vector.insert %114, %cst_1 [0] : i32 into vector<4xi32>
%116 = vector.extract %112[1, 0] : vector<4x1xi32>
%117 = vector.insert %116, %115 [1] : i32 into vector<4xi32>
%118 = vector.extract %112[2, 0] : vector<4x1xi32>
%119 = vector.insert %118, %117 [2] : i32 into vector<4xi32>
%120 = vector.extract %112[3, 0] : vector<4x1xi32>
%121 = vector.insert %120, %119 [3] : i32 into vector<4xi32>
%122 = vector.extract %36[0, 0] : vector<1x4xi32>
%123 = arith.muli %113, %121 : vector<4xi32>
%124 = vector.reduction <add>, %123, %122 : vector<4xi32> into i32
%125 = vector.insert %124, %cst_0 [0] : i32 into vector<1xi32>
%126 = vector.insert %125, %cst [0] : vector<1xi32> into vector<1x1xi32>
%127 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%128 = arith.extsi %127 : vector<4x1xi8> to vector<4x1xi32>
%129 = vector.extract %40[0] : vector<1x4xi32>
%130 = vector.extract %128[0, 0] : vector<4x1xi32>
%131 = vector.insert %130, %cst_1 [0] : i32 into vector<4xi32>
%132 = vector.extract %128[1, 0] : vector<4x1xi32>
%133 = vector.insert %132, %131 [1] : i32 into vector<4xi32>
%134 = vector.extract %128[2, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [2] : i32 into vector<4xi32>
%136 = vector.extract %128[3, 0] : vector<4x1xi32>
%137 = vector.insert %136, %135 [3] : i32 into vector<4xi32>
%138 = vector.extract %36[0, 1] : vector<1x4xi32>
%139 = arith.muli %129, %137 : vector<4xi32>
%140 = vector.reduction <add>, %139, %138 : vector<4xi32> into i32
%141 = vector.insert %140, %cst_0 [0] : i32 into vector<1xi32>
%142 = vector.insert %141, %cst [0] : vector<1xi32> into vector<1x1xi32>
%143 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%144 = arith.extsi %143 : vector<4x1xi8> to vector<4x1xi32>
%145 = vector.extract %40[0] : vector<1x4xi32>
%146 = vector.extract %144[0, 0] : vector<4x1xi32>
%147 = vector.insert %146, %cst_1 [0] : i32 into vector<4xi32>
%148 = vector.extract %144[1, 0] : vector<4x1xi32>
%149 = vector.insert %148, %147 [1] : i32 into vector<4xi32>
%150 = vector.extract %144[2, 0] : vector<4x1xi32>
%151 = vector.insert %150, %149 [2] : i32 into vector<4xi32>
%152 = vector.extract %144[3, 0] : vector<4x1xi32>
%153 = vector.insert %152, %151 [3] : i32 into vector<4xi32>
%154 = vector.extract %36[0, 2] : vector<1x4xi32>
%155 = arith.muli %145, %153 : vector<4xi32>
%156 = vector.reduction <add>, %155, %154 : vector<4xi32> into i32
%157 = vector.insert %156, %cst_0 [0] : i32 into vector<1xi32>
%158 = vector.insert %157, %cst [0] : vector<1xi32> into vector<1x1xi32>
%159 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%160 = arith.extsi %159 : vector<4x1xi8> to vector<4x1xi32>
%161 = vector.extract %40[0] : vector<1x4xi32>
%162 = vector.extract %160[0, 0] : vector<4x1xi32>
%163 = vector.insert %162, %cst_1 [0] : i32 into vector<4xi32>
%164 = vector.extract %160[1, 0] : vector<4x1xi32>
%165 = vector.insert %164, %163 [1] : i32 into vector<4xi32>
%166 = vector.extract %160[2, 0] : vector<4x1xi32>
%167 = vector.insert %166, %165 [2] : i32 into vector<4xi32>
%168 = vector.extract %160[3, 0] : vector<4x1xi32>
%169 = vector.insert %168, %167 [3] : i32 into vector<4xi32>
%170 = vector.extract %36[0, 3] : vector<1x4xi32>
%171 = arith.muli %161, %169 : vector<4xi32>
%172 = vector.reduction <add>, %171, %170 : vector<4xi32> into i32
%173 = vector.insert %172, %cst_0 [0] : i32 into vector<1xi32>
%174 = vector.insert %173, %cst [0] : vector<1xi32> into vector<1x1xi32>
%175 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%176 = arith.extsi %175 : vector<4x1xi8> to vector<4x1xi32>
%177 = vector.extract %41[0] : vector<1x4xi32>
%178 = vector.extract %176[0, 0] : vector<4x1xi32>
%179 = vector.insert %178, %cst_1 [0] : i32 into vector<4xi32>
%180 = vector.extract %176[1, 0] : vector<4x1xi32>
%181 = vector.insert %180, %179 [1] : i32 into vector<4xi32>
%182 = vector.extract %176[2, 0] : vector<4x1xi32>
%183 = vector.insert %182, %181 [2] : i32 into vector<4xi32>
%184 = vector.extract %176[3, 0] : vector<4x1xi32>
%185 = vector.insert %184, %183 [3] : i32 into vector<4xi32>
%186 = vector.extract %37[0, 0] : vector<1x4xi32>
%187 = arith.muli %177, %185 : vector<4xi32>
%188 = vector.reduction <add>, %187, %186 : vector<4xi32> into i32
%189 = vector.insert %188, %cst_0 [0] : i32 into vector<1xi32>
%190 = vector.insert %189, %cst [0] : vector<1xi32> into vector<1x1xi32>
%191 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%192 = arith.extsi %191 : vector<4x1xi8> to vector<4x1xi32>
%193 = vector.extract %41[0] : vector<1x4xi32>
%194 = vector.extract %192[0, 0] : vector<4x1xi32>
%195 = vector.insert %194, %cst_1 [0] : i32 into vector<4xi32>
%196 = vector.extract %192[1, 0] : vector<4x1xi32>
%197 = vector.insert %196, %195 [1] : i32 into vector<4xi32>
%198 = vector.extract %192[2, 0] : vector<4x1xi32>
%199 = vector.insert %198, %197 [2] : i32 into vector<4xi32>
%200 = vector.extract %192[3, 0] : vector<4x1xi32>
%201 = vector.insert %200, %199 [3] : i32 into vector<4xi32>
%202 = vector.extract %37[0, 1] : vector<1x4xi32>
%203 = arith.muli %193, %201 : vector<4xi32>
%204 = vector.reduction <add>, %203, %202 : vector<4xi32> into i32
%205 = vector.insert %204, %cst_0 [0] : i32 into vector<1xi32>
%206 = vector.insert %205, %cst [0] : vector<1xi32> into vector<1x1xi32>
%207 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%208 = arith.extsi %207 : vector<4x1xi8> to vector<4x1xi32>
%209 = vector.extract %41[0] : vector<1x4xi32>
%210 = vector.extract %208[0, 0] : vector<4x1xi32>
%211 = vector.insert %210, %cst_1 [0] : i32 into vector<4xi32>
%212 = vector.extract %208[1, 0] : vector<4x1xi32>
%213 = vector.insert %212, %211 [1] : i32 into vector<4xi32>
%214 = vector.extract %208[2, 0] : vector<4x1xi32>
%215 = vector.insert %214, %213 [2] : i32 into vector<4xi32>
%216 = vector.extract %208[3, 0] : vector<4x1xi32>
%217 = vector.insert %216, %215 [3] : i32 into vector<4xi32>
%218 = vector.extract %37[0, 2] : vector<1x4xi32>
%219 = arith.muli %209, %217 : vector<4xi32>
%220 = vector.reduction <add>, %219, %218 : vector<4xi32> into i32
%221 = vector.insert %220, %cst_0 [0] : i32 into vector<1xi32>
%222 = vector.insert %221, %cst [0] : vector<1xi32> into vector<1x1xi32>
%223 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%224 = arith.extsi %223 : vector<4x1xi8> to vector<4x1xi32>
%225 = vector.extract %41[0] : vector<1x4xi32>
%226 = vector.extract %224[0, 0] : vector<4x1xi32>
%227 = vector.insert %226, %cst_1 [0] : i32 into vector<4xi32>
%228 = vector.extract %224[1, 0] : vector<4x1xi32>
%229 = vector.insert %228, %227 [1] : i32 into vector<4xi32>
%230 = vector.extract %224[2, 0] : vector<4x1xi32>
%231 = vector.insert %230, %229 [2] : i32 into vector<4xi32>
%232 = vector.extract %224[3, 0] : vector<4x1xi32>
%233 = vector.insert %232, %231 [3] : i32 into vector<4xi32>
%234 = vector.extract %37[0, 3] : vector<1x4xi32>
%235 = arith.muli %225, %233 : vector<4xi32>
%236 = vector.reduction <add>, %235, %234 : vector<4xi32> into i32
%237 = vector.insert %236, %cst_0 [0] : i32 into vector<1xi32>
%238 = vector.insert %237, %cst [0] : vector<1xi32> into vector<1x1xi32>
%239 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%240 = arith.extsi %239 : vector<4x1xi8> to vector<4x1xi32>
%241 = vector.extract %42[0] : vector<1x4xi32>
%242 = vector.extract %240[0, 0] : vector<4x1xi32>
%243 = vector.insert %242, %cst_1 [0] : i32 into vector<4xi32>
%244 = vector.extract %240[1, 0] : vector<4x1xi32>
%245 = vector.insert %244, %243 [1] : i32 into vector<4xi32>
%246 = vector.extract %240[2, 0] : vector<4x1xi32>
%247 = vector.insert %246, %245 [2] : i32 into vector<4xi32>
%248 = vector.extract %240[3, 0] : vector<4x1xi32>
%249 = vector.insert %248, %247 [3] : i32 into vector<4xi32>
%250 = vector.extract %38[0, 0] : vector<1x4xi32>
%251 = arith.muli %241, %249 : vector<4xi32>
%252 = vector.reduction <add>, %251, %250 : vector<4xi32> into i32
%253 = vector.insert %252, %cst_0 [0] : i32 into vector<1xi32>
%254 = vector.insert %253, %cst [0] : vector<1xi32> into vector<1x1xi32>
%255 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%256 = arith.extsi %255 : vector<4x1xi8> to vector<4x1xi32>
%257 = vector.extract %42[0] : vector<1x4xi32>
%258 = vector.extract %256[0, 0] : vector<4x1xi32>
%259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32>
%260 = vector.extract %256[1, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [1] : i32 into vector<4xi32>
%262 = vector.extract %256[2, 0] : vector<4x1xi32>
%263 = vector.insert %262, %261 [2] : i32 into vector<4xi32>
%264 = vector.extract %256[3, 0] : vector<4x1xi32>
%265 = vector.insert %264, %263 [3] : i32 into vector<4xi32>
%266 = vector.extract %38[0, 1] : vector<1x4xi32>
%267 = arith.muli %257, %265 : vector<4xi32>
%268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32
%269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32>
%270 = vector.insert %269, %cst [0] : vector<1xi32> into vector<1x1xi32>
%271 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%272 = arith.extsi %271 : vector<4x1xi8> to vector<4x1xi32>
%273 = vector.extract %42[0] : vector<1x4xi32>
%274 = vector.extract %272[0, 0] : vector<4x1xi32>
%275 = vector.insert %274, %cst_1 [0] : i32 into vector<4xi32>
%276 = vector.extract %272[1, 0] : vector<4x1xi32>
%277 = vector.insert %276, %275 [1] : i32 into vector<4xi32>
%278 = vector.extract %272[2, 0] : vector<4x1xi32>
%279 = vector.insert %278, %277 [2] : i32 into vector<4xi32>
%280 = vector.extract %272[3, 0] : vector<4x1xi32>
%281 = vector.insert %280, %279 [3] : i32 into vector<4xi32>
%282 = vector.extract %38[0, 2] : vector<1x4xi32>
%283 = arith.muli %273, %281 : vector<4xi32>
%284 = vector.reduction <add>, %283, %282 : vector<4xi32> into i32
%285 = vector.insert %284, %cst_0 [0] : i32 into vector<1xi32>
%286 = vector.insert %285, %cst [0] : vector<1xi32> into vector<1x1xi32>
%287 = vector.extract_strided_slice %46 {offsets = [0, 3], sizes = [4, 1], strides = [1, 1]} : vector<4x4xi8> to vector<4x1xi8>
%288 = arith.extsi %287 : vector<4x1xi8> to vector<4x1xi32>
%289 = vector.extract %42[0] : vector<1x4xi32>
%290 = vector.extract %288[0, 0] : vector<4x1xi32>
%291 = vector.insert %290, %cst_1 [0] : i32 into vector<4xi32>
%292 = vector.extract %288[1, 0] : vector<4x1xi32>
%293 = vector.insert %292, %291 [1] : i32 into vector<4xi32>
%294 = vector.extract %288[2, 0] : vector<4x1xi32>
%295 = vector.insert %294, %293 [2] : i32 into vector<4xi32>
%296 = vector.extract %288[3, 0] : vector<4x1xi32>
%297 = vector.insert %296, %295 [3] : i32 into vector<4xi32>
%298 = vector.extract %38[0, 3] : vector<1x4xi32>
%299 = arith.muli %289, %297 : vector<4xi32>
%300 = vector.reduction <add>, %299, %298 : vector<4xi32> into i32
%301 = vector.insert %300, %cst_0 [0] : i32 into vector<1xi32>
%302 = vector.insert %301, %cst [0] : vector<1xi32> into vector<1x1xi32>
%303 = vector.insert_strided_slice %62, %cst_3 {offsets = [0, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%304 = vector.insert_strided_slice %78, %303 {offsets = [0, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%305 = vector.insert_strided_slice %94, %304 {offsets = [0, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%306 = vector.insert_strided_slice %110, %305 {offsets = [0, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%307 = vector.insert_strided_slice %126, %306 {offsets = [1, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%308 = vector.insert_strided_slice %142, %307 {offsets = [1, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%309 = vector.insert_strided_slice %158, %308 {offsets = [1, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%310 = vector.insert_strided_slice %174, %309 {offsets = [1, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%311 = vector.insert_strided_slice %190, %310 {offsets = [2, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%312 = vector.insert_strided_slice %206, %311 {offsets = [2, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%313 = vector.insert_strided_slice %222, %312 {offsets = [2, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%314 = vector.insert_strided_slice %238, %313 {offsets = [2, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%315 = vector.insert_strided_slice %254, %314 {offsets = [3, 0], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%316 = vector.insert_strided_slice %270, %315 {offsets = [3, 1], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%317 = vector.insert_strided_slice %286, %316 {offsets = [3, 2], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%318 = vector.insert_strided_slice %302, %317 {offsets = [3, 3], strides = [1, 1]} : vector<1x1xi32> into vector<4x4xi32>
%319 = vector.extract_strided_slice %318 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%320 = vector.transfer_write %319, %arg7[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%321 = vector.extract_strided_slice %318 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%322 = vector.transfer_write %321, %320[%c1, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%323 = vector.extract_strided_slice %318 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%324 = vector.transfer_write %323, %322[%c2, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
%325 = vector.extract_strided_slice %318 {offsets = [3, 0], sizes = [1, 4], strides = [1, 1]} : vector<4x4xi32> to vector<1x4xi32>
%326 = vector.transfer_write %325, %324[%c3, %c0] {in_bounds = [true, true]} : vector<1x4xi32>, tensor<4x4xi32>
scf.yield %326 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %20 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
--- After trimming leading unit dims ---
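// In the dump below the leading unit dimensions have been dropped from the vector types:
// the per-row loads and accumulator reads now use vector<4xi8> / vector<4xi32> directly
// instead of vector<1x4xi8> / vector<1x4xi32>, and the zero-initialization of the 4x4
// accumulator tile is written one vector<4xi32> zero constant per row. A representative
// before/after pair, with the SSA value name shortened purely for illustration:
//
//   // before trimming: rank-2 read carrying a unit leading dim
//   %a = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true, true]} : tensor<8x1024xi8>, vector<1x4xi8>
//   // after trimming: rank-1 read
//   %a = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>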
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %c1024 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %c1024 step %6 {
%7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %7) -> (tensor<8x32xi32>) {
%11 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%12 = vector.transfer_write %cst_1, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%13 = vector.transfer_write %cst_1, %12[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%14 = vector.transfer_write %cst_1, %13[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%15 = vector.transfer_write %cst_1, %14[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%16 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %15) -> (tensor<4x4xi32>) {
%17 = vector.transfer_read %8[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%18 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%19 = vector.transfer_read %8[%18, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%20 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%21 = vector.transfer_read %8[%20, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%22 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%23 = vector.transfer_read %8[%22, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = vector.transfer_read %arg7[%c0, %c0], %c0_i32 {in_bounds = [true]} : tensor<4x4xi32>, vector<4xi32>
%32 = vector.transfer_read %arg7[%c1, %c0], %c0_i32 {in_bounds = [true]} : tensor<4x4xi32>, vector<4xi32>
%33 = vector.transfer_read %arg7[%c2, %c0], %c0_i32 {in_bounds = [true]} : tensor<4x4xi32>, vector<4xi32>
%34 = vector.transfer_read %arg7[%c3, %c0], %c0_i32 {in_bounds = [true]} : tensor<4x4xi32>, vector<4xi32>
%35 = arith.extsi %17 : vector<4xi8> to vector<4xi32>
%36 = arith.extsi %19 : vector<4xi8> to vector<4xi32>
%37 = arith.extsi %21 : vector<4xi8> to vector<4xi32>
%38 = arith.extsi %23 : vector<4xi8> to vector<4xi32>
%39 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %cst [0] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [1] : vector<1xi8> into vector<4x1xi8>
%43 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%44 = vector.insert %43, %42 [2] : vector<1xi8> into vector<4x1xi8>
%45 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%46 = vector.insert %45, %44 [3] : vector<1xi8> into vector<4x1xi8>
%47 = arith.extsi %46 : vector<4x1xi8> to vector<4x1xi32>
%48 = vector.extract %47[0, 0] : vector<4x1xi32>
%49 = vector.insert %48, %cst_1 [0] : i32 into vector<4xi32>
%50 = vector.extract %47[1, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [1] : i32 into vector<4xi32>
%52 = vector.extract %47[2, 0] : vector<4x1xi32>
%53 = vector.insert %52, %51 [2] : i32 into vector<4xi32>
%54 = vector.extract %47[3, 0] : vector<4x1xi32>
%55 = vector.insert %54, %53 [3] : i32 into vector<4xi32>
%56 = vector.extract %31[0] : vector<4xi32>
%57 = arith.muli %35, %55 : vector<4xi32>
%58 = vector.reduction <add>, %57, %56 : vector<4xi32> into i32
%59 = vector.insert %58, %cst_0 [0] : i32 into vector<1xi32>
%60 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %cst [0] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [1] : vector<1xi8> into vector<4x1xi8>
%64 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%65 = vector.insert %64, %63 [2] : vector<1xi8> into vector<4x1xi8>
%66 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%67 = vector.insert %66, %65 [3] : vector<1xi8> into vector<4x1xi8>
%68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32>
%69 = vector.extract %68[0, 0] : vector<4x1xi32>
%70 = vector.insert %69, %cst_1 [0] : i32 into vector<4xi32>
%71 = vector.extract %68[1, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [1] : i32 into vector<4xi32>
%73 = vector.extract %68[2, 0] : vector<4x1xi32>
%74 = vector.insert %73, %72 [2] : i32 into vector<4xi32>
%75 = vector.extract %68[3, 0] : vector<4x1xi32>
%76 = vector.insert %75, %74 [3] : i32 into vector<4xi32>
%77 = vector.extract %31[1] : vector<4xi32>
%78 = arith.muli %35, %76 : vector<4xi32>
%79 = vector.reduction <add>, %78, %77 : vector<4xi32> into i32
%80 = vector.insert %79, %cst_0 [0] : i32 into vector<1xi32>
%81 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %cst [0] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [1] : vector<1xi8> into vector<4x1xi8>
%85 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%86 = vector.insert %85, %84 [2] : vector<1xi8> into vector<4x1xi8>
%87 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%88 = vector.insert %87, %86 [3] : vector<1xi8> into vector<4x1xi8>
%89 = arith.extsi %88 : vector<4x1xi8> to vector<4x1xi32>
%90 = vector.extract %89[0, 0] : vector<4x1xi32>
%91 = vector.insert %90, %cst_1 [0] : i32 into vector<4xi32>
%92 = vector.extract %89[1, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [1] : i32 into vector<4xi32>
%94 = vector.extract %89[2, 0] : vector<4x1xi32>
%95 = vector.insert %94, %93 [2] : i32 into vector<4xi32>
%96 = vector.extract %89[3, 0] : vector<4x1xi32>
%97 = vector.insert %96, %95 [3] : i32 into vector<4xi32>
%98 = vector.extract %31[2] : vector<4xi32>
%99 = arith.muli %35, %97 : vector<4xi32>
%100 = vector.reduction <add>, %99, %98 : vector<4xi32> into i32
%101 = vector.insert %100, %cst_0 [0] : i32 into vector<1xi32>
%102 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %cst [0] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [1] : vector<1xi8> into vector<4x1xi8>
%106 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%107 = vector.insert %106, %105 [2] : vector<1xi8> into vector<4x1xi8>
%108 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%109 = vector.insert %108, %107 [3] : vector<1xi8> into vector<4x1xi8>
%110 = arith.extsi %109 : vector<4x1xi8> to vector<4x1xi32>
%111 = vector.extract %110[0, 0] : vector<4x1xi32>
%112 = vector.insert %111, %cst_1 [0] : i32 into vector<4xi32>
%113 = vector.extract %110[1, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [1] : i32 into vector<4xi32>
%115 = vector.extract %110[2, 0] : vector<4x1xi32>
%116 = vector.insert %115, %114 [2] : i32 into vector<4xi32>
%117 = vector.extract %110[3, 0] : vector<4x1xi32>
%118 = vector.insert %117, %116 [3] : i32 into vector<4xi32>
%119 = vector.extract %31[3] : vector<4xi32>
%120 = arith.muli %35, %118 : vector<4xi32>
%121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32
%122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32>
%123 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%124 = vector.insert %123, %cst [0] : vector<1xi8> into vector<4x1xi8>
%125 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%126 = vector.insert %125, %124 [1] : vector<1xi8> into vector<4x1xi8>
%127 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%128 = vector.insert %127, %126 [2] : vector<1xi8> into vector<4x1xi8>
%129 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%130 = vector.insert %129, %128 [3] : vector<1xi8> into vector<4x1xi8>
%131 = arith.extsi %130 : vector<4x1xi8> to vector<4x1xi32>
%132 = vector.extract %131[0, 0] : vector<4x1xi32>
%133 = vector.insert %132, %cst_1 [0] : i32 into vector<4xi32>
%134 = vector.extract %131[1, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [1] : i32 into vector<4xi32>
%136 = vector.extract %131[2, 0] : vector<4x1xi32>
%137 = vector.insert %136, %135 [2] : i32 into vector<4xi32>
%138 = vector.extract %131[3, 0] : vector<4x1xi32>
%139 = vector.insert %138, %137 [3] : i32 into vector<4xi32>
%140 = vector.extract %32[0] : vector<4xi32>
%141 = arith.muli %36, %139 : vector<4xi32>
%142 = vector.reduction <add>, %141, %140 : vector<4xi32> into i32
%143 = vector.insert %142, %cst_0 [0] : i32 into vector<1xi32>
%144 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%145 = vector.insert %144, %cst [0] : vector<1xi8> into vector<4x1xi8>
%146 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%147 = vector.insert %146, %145 [1] : vector<1xi8> into vector<4x1xi8>
%148 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%149 = vector.insert %148, %147 [2] : vector<1xi8> into vector<4x1xi8>
%150 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%151 = vector.insert %150, %149 [3] : vector<1xi8> into vector<4x1xi8>
%152 = arith.extsi %151 : vector<4x1xi8> to vector<4x1xi32>
%153 = vector.extract %152[0, 0] : vector<4x1xi32>
%154 = vector.insert %153, %cst_1 [0] : i32 into vector<4xi32>
%155 = vector.extract %152[1, 0] : vector<4x1xi32>
%156 = vector.insert %155, %154 [1] : i32 into vector<4xi32>
%157 = vector.extract %152[2, 0] : vector<4x1xi32>
%158 = vector.insert %157, %156 [2] : i32 into vector<4xi32>
%159 = vector.extract %152[3, 0] : vector<4x1xi32>
%160 = vector.insert %159, %158 [3] : i32 into vector<4xi32>
%161 = vector.extract %32[1] : vector<4xi32>
%162 = arith.muli %36, %160 : vector<4xi32>
%163 = vector.reduction <add>, %162, %161 : vector<4xi32> into i32
%164 = vector.insert %163, %cst_0 [0] : i32 into vector<1xi32>
%165 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%166 = vector.insert %165, %cst [0] : vector<1xi8> into vector<4x1xi8>
%167 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%168 = vector.insert %167, %166 [1] : vector<1xi8> into vector<4x1xi8>
%169 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%170 = vector.insert %169, %168 [2] : vector<1xi8> into vector<4x1xi8>
%171 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%172 = vector.insert %171, %170 [3] : vector<1xi8> into vector<4x1xi8>
%173 = arith.extsi %172 : vector<4x1xi8> to vector<4x1xi32>
%174 = vector.extract %173[0, 0] : vector<4x1xi32>
%175 = vector.insert %174, %cst_1 [0] : i32 into vector<4xi32>
%176 = vector.extract %173[1, 0] : vector<4x1xi32>
%177 = vector.insert %176, %175 [1] : i32 into vector<4xi32>
%178 = vector.extract %173[2, 0] : vector<4x1xi32>
%179 = vector.insert %178, %177 [2] : i32 into vector<4xi32>
%180 = vector.extract %173[3, 0] : vector<4x1xi32>
%181 = vector.insert %180, %179 [3] : i32 into vector<4xi32>
%182 = vector.extract %32[2] : vector<4xi32>
%183 = arith.muli %36, %181 : vector<4xi32>
%184 = vector.reduction <add>, %183, %182 : vector<4xi32> into i32
%185 = vector.insert %184, %cst_0 [0] : i32 into vector<1xi32>
%186 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%187 = vector.insert %186, %cst [0] : vector<1xi8> into vector<4x1xi8>
%188 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%189 = vector.insert %188, %187 [1] : vector<1xi8> into vector<4x1xi8>
%190 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%191 = vector.insert %190, %189 [2] : vector<1xi8> into vector<4x1xi8>
%192 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%193 = vector.insert %192, %191 [3] : vector<1xi8> into vector<4x1xi8>
%194 = arith.extsi %193 : vector<4x1xi8> to vector<4x1xi32>
%195 = vector.extract %194[0, 0] : vector<4x1xi32>
%196 = vector.insert %195, %cst_1 [0] : i32 into vector<4xi32>
%197 = vector.extract %194[1, 0] : vector<4x1xi32>
%198 = vector.insert %197, %196 [1] : i32 into vector<4xi32>
%199 = vector.extract %194[2, 0] : vector<4x1xi32>
%200 = vector.insert %199, %198 [2] : i32 into vector<4xi32>
%201 = vector.extract %194[3, 0] : vector<4x1xi32>
%202 = vector.insert %201, %200 [3] : i32 into vector<4xi32>
%203 = vector.extract %32[3] : vector<4xi32>
%204 = arith.muli %36, %202 : vector<4xi32>
%205 = vector.reduction <add>, %204, %203 : vector<4xi32> into i32
%206 = vector.insert %205, %cst_0 [0] : i32 into vector<1xi32>
%207 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%208 = vector.insert %207, %cst [0] : vector<1xi8> into vector<4x1xi8>
%209 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%210 = vector.insert %209, %208 [1] : vector<1xi8> into vector<4x1xi8>
%211 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%212 = vector.insert %211, %210 [2] : vector<1xi8> into vector<4x1xi8>
%213 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%214 = vector.insert %213, %212 [3] : vector<1xi8> into vector<4x1xi8>
%215 = arith.extsi %214 : vector<4x1xi8> to vector<4x1xi32>
%216 = vector.extract %215[0, 0] : vector<4x1xi32>
%217 = vector.insert %216, %cst_1 [0] : i32 into vector<4xi32>
%218 = vector.extract %215[1, 0] : vector<4x1xi32>
%219 = vector.insert %218, %217 [1] : i32 into vector<4xi32>
%220 = vector.extract %215[2, 0] : vector<4x1xi32>
%221 = vector.insert %220, %219 [2] : i32 into vector<4xi32>
%222 = vector.extract %215[3, 0] : vector<4x1xi32>
%223 = vector.insert %222, %221 [3] : i32 into vector<4xi32>
%224 = vector.extract %33[0] : vector<4xi32>
%225 = arith.muli %37, %223 : vector<4xi32>
%226 = vector.reduction <add>, %225, %224 : vector<4xi32> into i32
%227 = vector.insert %226, %cst_0 [0] : i32 into vector<1xi32>
%228 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%229 = vector.insert %228, %cst [0] : vector<1xi8> into vector<4x1xi8>
%230 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%231 = vector.insert %230, %229 [1] : vector<1xi8> into vector<4x1xi8>
%232 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%233 = vector.insert %232, %231 [2] : vector<1xi8> into vector<4x1xi8>
%234 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%235 = vector.insert %234, %233 [3] : vector<1xi8> into vector<4x1xi8>
%236 = arith.extsi %235 : vector<4x1xi8> to vector<4x1xi32>
%237 = vector.extract %236[0, 0] : vector<4x1xi32>
%238 = vector.insert %237, %cst_1 [0] : i32 into vector<4xi32>
%239 = vector.extract %236[1, 0] : vector<4x1xi32>
%240 = vector.insert %239, %238 [1] : i32 into vector<4xi32>
%241 = vector.extract %236[2, 0] : vector<4x1xi32>
%242 = vector.insert %241, %240 [2] : i32 into vector<4xi32>
%243 = vector.extract %236[3, 0] : vector<4x1xi32>
%244 = vector.insert %243, %242 [3] : i32 into vector<4xi32>
%245 = vector.extract %33[1] : vector<4xi32>
%246 = arith.muli %37, %244 : vector<4xi32>
%247 = vector.reduction <add>, %246, %245 : vector<4xi32> into i32
%248 = vector.insert %247, %cst_0 [0] : i32 into vector<1xi32>
%249 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%250 = vector.insert %249, %cst [0] : vector<1xi8> into vector<4x1xi8>
%251 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%252 = vector.insert %251, %250 [1] : vector<1xi8> into vector<4x1xi8>
%253 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%254 = vector.insert %253, %252 [2] : vector<1xi8> into vector<4x1xi8>
%255 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%256 = vector.insert %255, %254 [3] : vector<1xi8> into vector<4x1xi8>
%257 = arith.extsi %256 : vector<4x1xi8> to vector<4x1xi32>
%258 = vector.extract %257[0, 0] : vector<4x1xi32>
%259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32>
%260 = vector.extract %257[1, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [1] : i32 into vector<4xi32>
%262 = vector.extract %257[2, 0] : vector<4x1xi32>
%263 = vector.insert %262, %261 [2] : i32 into vector<4xi32>
%264 = vector.extract %257[3, 0] : vector<4x1xi32>
%265 = vector.insert %264, %263 [3] : i32 into vector<4xi32>
%266 = vector.extract %33[2] : vector<4xi32>
%267 = arith.muli %37, %265 : vector<4xi32>
%268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32
%269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32>
%270 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%271 = vector.insert %270, %cst [0] : vector<1xi8> into vector<4x1xi8>
%272 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%273 = vector.insert %272, %271 [1] : vector<1xi8> into vector<4x1xi8>
%274 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%275 = vector.insert %274, %273 [2] : vector<1xi8> into vector<4x1xi8>
%276 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%277 = vector.insert %276, %275 [3] : vector<1xi8> into vector<4x1xi8>
%278 = arith.extsi %277 : vector<4x1xi8> to vector<4x1xi32>
%279 = vector.extract %278[0, 0] : vector<4x1xi32>
%280 = vector.insert %279, %cst_1 [0] : i32 into vector<4xi32>
%281 = vector.extract %278[1, 0] : vector<4x1xi32>
%282 = vector.insert %281, %280 [1] : i32 into vector<4xi32>
%283 = vector.extract %278[2, 0] : vector<4x1xi32>
%284 = vector.insert %283, %282 [2] : i32 into vector<4xi32>
%285 = vector.extract %278[3, 0] : vector<4x1xi32>
%286 = vector.insert %285, %284 [3] : i32 into vector<4xi32>
%287 = vector.extract %33[3] : vector<4xi32>
%288 = arith.muli %37, %286 : vector<4xi32>
%289 = vector.reduction <add>, %288, %287 : vector<4xi32> into i32
%290 = vector.insert %289, %cst_0 [0] : i32 into vector<1xi32>
%291 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%292 = vector.insert %291, %cst [0] : vector<1xi8> into vector<4x1xi8>
%293 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%294 = vector.insert %293, %292 [1] : vector<1xi8> into vector<4x1xi8>
%295 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%296 = vector.insert %295, %294 [2] : vector<1xi8> into vector<4x1xi8>
%297 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%298 = vector.insert %297, %296 [3] : vector<1xi8> into vector<4x1xi8>
%299 = arith.extsi %298 : vector<4x1xi8> to vector<4x1xi32>
%300 = vector.extract %299[0, 0] : vector<4x1xi32>
%301 = vector.insert %300, %cst_1 [0] : i32 into vector<4xi32>
%302 = vector.extract %299[1, 0] : vector<4x1xi32>
%303 = vector.insert %302, %301 [1] : i32 into vector<4xi32>
%304 = vector.extract %299[2, 0] : vector<4x1xi32>
%305 = vector.insert %304, %303 [2] : i32 into vector<4xi32>
%306 = vector.extract %299[3, 0] : vector<4x1xi32>
%307 = vector.insert %306, %305 [3] : i32 into vector<4xi32>
%308 = vector.extract %34[0] : vector<4xi32>
%309 = arith.muli %38, %307 : vector<4xi32>
%310 = vector.reduction <add>, %309, %308 : vector<4xi32> into i32
%311 = vector.insert %310, %cst_0 [0] : i32 into vector<1xi32>
%312 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%313 = vector.insert %312, %cst [0] : vector<1xi8> into vector<4x1xi8>
%314 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%315 = vector.insert %314, %313 [1] : vector<1xi8> into vector<4x1xi8>
%316 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%317 = vector.insert %316, %315 [2] : vector<1xi8> into vector<4x1xi8>
%318 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%319 = vector.insert %318, %317 [3] : vector<1xi8> into vector<4x1xi8>
%320 = arith.extsi %319 : vector<4x1xi8> to vector<4x1xi32>
%321 = vector.extract %320[0, 0] : vector<4x1xi32>
%322 = vector.insert %321, %cst_1 [0] : i32 into vector<4xi32>
%323 = vector.extract %320[1, 0] : vector<4x1xi32>
%324 = vector.insert %323, %322 [1] : i32 into vector<4xi32>
%325 = vector.extract %320[2, 0] : vector<4x1xi32>
%326 = vector.insert %325, %324 [2] : i32 into vector<4xi32>
%327 = vector.extract %320[3, 0] : vector<4x1xi32>
%328 = vector.insert %327, %326 [3] : i32 into vector<4xi32>
%329 = vector.extract %34[1] : vector<4xi32>
%330 = arith.muli %38, %328 : vector<4xi32>
%331 = vector.reduction <add>, %330, %329 : vector<4xi32> into i32
%332 = vector.insert %331, %cst_0 [0] : i32 into vector<1xi32>
%333 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%334 = vector.insert %333, %cst [0] : vector<1xi8> into vector<4x1xi8>
%335 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%336 = vector.insert %335, %334 [1] : vector<1xi8> into vector<4x1xi8>
%337 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%338 = vector.insert %337, %336 [2] : vector<1xi8> into vector<4x1xi8>
%339 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%340 = vector.insert %339, %338 [3] : vector<1xi8> into vector<4x1xi8>
%341 = arith.extsi %340 : vector<4x1xi8> to vector<4x1xi32>
%342 = vector.extract %341[0, 0] : vector<4x1xi32>
%343 = vector.insert %342, %cst_1 [0] : i32 into vector<4xi32>
%344 = vector.extract %341[1, 0] : vector<4x1xi32>
%345 = vector.insert %344, %343 [1] : i32 into vector<4xi32>
%346 = vector.extract %341[2, 0] : vector<4x1xi32>
%347 = vector.insert %346, %345 [2] : i32 into vector<4xi32>
%348 = vector.extract %341[3, 0] : vector<4x1xi32>
%349 = vector.insert %348, %347 [3] : i32 into vector<4xi32>
%350 = vector.extract %34[2] : vector<4xi32>
%351 = arith.muli %38, %349 : vector<4xi32>
%352 = vector.reduction <add>, %351, %350 : vector<4xi32> into i32
%353 = vector.insert %352, %cst_0 [0] : i32 into vector<1xi32>
%354 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%355 = vector.insert %354, %cst [0] : vector<1xi8> into vector<4x1xi8>
%356 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%357 = vector.insert %356, %355 [1] : vector<1xi8> into vector<4x1xi8>
%358 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%359 = vector.insert %358, %357 [2] : vector<1xi8> into vector<4x1xi8>
%360 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%361 = vector.insert %360, %359 [3] : vector<1xi8> into vector<4x1xi8>
%362 = arith.extsi %361 : vector<4x1xi8> to vector<4x1xi32>
%363 = vector.extract %362[0, 0] : vector<4x1xi32>
%364 = vector.insert %363, %cst_1 [0] : i32 into vector<4xi32>
%365 = vector.extract %362[1, 0] : vector<4x1xi32>
%366 = vector.insert %365, %364 [1] : i32 into vector<4xi32>
%367 = vector.extract %362[2, 0] : vector<4x1xi32>
%368 = vector.insert %367, %366 [2] : i32 into vector<4xi32>
%369 = vector.extract %362[3, 0] : vector<4x1xi32>
%370 = vector.insert %369, %368 [3] : i32 into vector<4xi32>
%371 = vector.extract %34[3] : vector<4xi32>
%372 = arith.muli %38, %370 : vector<4xi32>
%373 = vector.reduction <add>, %372, %371 : vector<4xi32> into i32
%374 = vector.insert %373, %cst_0 [0] : i32 into vector<1xi32>
%375 = vector.insert_strided_slice %59, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%376 = vector.insert_strided_slice %80, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%377 = vector.insert_strided_slice %101, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%378 = vector.insert_strided_slice %122, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%379 = vector.insert_strided_slice %143, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%380 = vector.insert_strided_slice %164, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%381 = vector.insert_strided_slice %185, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%382 = vector.insert_strided_slice %206, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%383 = vector.insert_strided_slice %227, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%384 = vector.insert_strided_slice %248, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%385 = vector.insert_strided_slice %269, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%386 = vector.insert_strided_slice %290, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%387 = vector.insert_strided_slice %311, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%388 = vector.insert_strided_slice %332, %387 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%389 = vector.insert_strided_slice %353, %388 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%390 = vector.insert_strided_slice %374, %389 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%391 = vector.transfer_write %378, %arg7[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%392 = vector.transfer_write %382, %391[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%393 = vector.transfer_write %386, %392[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%394 = vector.transfer_write %390, %393[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
scf.yield %394 : tensor<4x4xi32>
}
%inserted_slice = tensor.insert_slice %16 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %11 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
--- After hoisting transfers ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15 = vector.transfer_write %cst_1, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%16 = vector.transfer_write %cst_1, %15[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %cst_1, %16[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %cst_1, %17[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%24 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%25 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%26 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%27 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%35 = arith.extsi %24 : vector<4xi8> to vector<4xi32>
%36 = arith.extsi %25 : vector<4xi8> to vector<4xi32>
%37 = arith.extsi %26 : vector<4xi8> to vector<4xi32>
%38 = arith.extsi %27 : vector<4xi8> to vector<4xi32>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %cst [0] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [1] : vector<1xi8> into vector<4x1xi8>
%43 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%44 = vector.insert %43, %42 [2] : vector<1xi8> into vector<4x1xi8>
%45 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%46 = vector.insert %45, %44 [3] : vector<1xi8> into vector<4x1xi8>
%47 = arith.extsi %46 : vector<4x1xi8> to vector<4x1xi32>
%48 = vector.extract %47[0, 0] : vector<4x1xi32>
%49 = vector.insert %48, %cst_1 [0] : i32 into vector<4xi32>
%50 = vector.extract %47[1, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [1] : i32 into vector<4xi32>
%52 = vector.extract %47[2, 0] : vector<4x1xi32>
%53 = vector.insert %52, %51 [2] : i32 into vector<4xi32>
%54 = vector.extract %47[3, 0] : vector<4x1xi32>
%55 = vector.insert %54, %53 [3] : i32 into vector<4xi32>
%56 = vector.extract %arg10[0] : vector<4xi32>
%57 = arith.muli %35, %55 : vector<4xi32>
%58 = vector.reduction <add>, %57, %56 : vector<4xi32> into i32
%59 = vector.insert %58, %cst_0 [0] : i32 into vector<1xi32>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %cst [0] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [1] : vector<1xi8> into vector<4x1xi8>
%64 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%65 = vector.insert %64, %63 [2] : vector<1xi8> into vector<4x1xi8>
%66 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%67 = vector.insert %66, %65 [3] : vector<1xi8> into vector<4x1xi8>
%68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32>
%69 = vector.extract %68[0, 0] : vector<4x1xi32>
%70 = vector.insert %69, %cst_1 [0] : i32 into vector<4xi32>
%71 = vector.extract %68[1, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [1] : i32 into vector<4xi32>
%73 = vector.extract %68[2, 0] : vector<4x1xi32>
%74 = vector.insert %73, %72 [2] : i32 into vector<4xi32>
%75 = vector.extract %68[3, 0] : vector<4x1xi32>
%76 = vector.insert %75, %74 [3] : i32 into vector<4xi32>
%77 = vector.extract %arg10[1] : vector<4xi32>
%78 = arith.muli %35, %76 : vector<4xi32>
%79 = vector.reduction <add>, %78, %77 : vector<4xi32> into i32
%80 = vector.insert %79, %cst_0 [0] : i32 into vector<1xi32>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %cst [0] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [1] : vector<1xi8> into vector<4x1xi8>
%85 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%86 = vector.insert %85, %84 [2] : vector<1xi8> into vector<4x1xi8>
%87 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%88 = vector.insert %87, %86 [3] : vector<1xi8> into vector<4x1xi8>
%89 = arith.extsi %88 : vector<4x1xi8> to vector<4x1xi32>
%90 = vector.extract %89[0, 0] : vector<4x1xi32>
%91 = vector.insert %90, %cst_1 [0] : i32 into vector<4xi32>
%92 = vector.extract %89[1, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [1] : i32 into vector<4xi32>
%94 = vector.extract %89[2, 0] : vector<4x1xi32>
%95 = vector.insert %94, %93 [2] : i32 into vector<4xi32>
%96 = vector.extract %89[3, 0] : vector<4x1xi32>
%97 = vector.insert %96, %95 [3] : i32 into vector<4xi32>
%98 = vector.extract %arg10[2] : vector<4xi32>
%99 = arith.muli %35, %97 : vector<4xi32>
%100 = vector.reduction <add>, %99, %98 : vector<4xi32> into i32
%101 = vector.insert %100, %cst_0 [0] : i32 into vector<1xi32>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %cst [0] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [1] : vector<1xi8> into vector<4x1xi8>
%106 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%107 = vector.insert %106, %105 [2] : vector<1xi8> into vector<4x1xi8>
%108 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%109 = vector.insert %108, %107 [3] : vector<1xi8> into vector<4x1xi8>
%110 = arith.extsi %109 : vector<4x1xi8> to vector<4x1xi32>
%111 = vector.extract %110[0, 0] : vector<4x1xi32>
%112 = vector.insert %111, %cst_1 [0] : i32 into vector<4xi32>
%113 = vector.extract %110[1, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [1] : i32 into vector<4xi32>
%115 = vector.extract %110[2, 0] : vector<4x1xi32>
%116 = vector.insert %115, %114 [2] : i32 into vector<4xi32>
%117 = vector.extract %110[3, 0] : vector<4x1xi32>
%118 = vector.insert %117, %116 [3] : i32 into vector<4xi32>
%119 = vector.extract %arg10[3] : vector<4xi32>
%120 = arith.muli %35, %118 : vector<4xi32>
%121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32
%122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32>
%123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%124 = vector.insert %123, %cst [0] : vector<1xi8> into vector<4x1xi8>
%125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%126 = vector.insert %125, %124 [1] : vector<1xi8> into vector<4x1xi8>
%127 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%128 = vector.insert %127, %126 [2] : vector<1xi8> into vector<4x1xi8>
%129 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%130 = vector.insert %129, %128 [3] : vector<1xi8> into vector<4x1xi8>
%131 = arith.extsi %130 : vector<4x1xi8> to vector<4x1xi32>
%132 = vector.extract %131[0, 0] : vector<4x1xi32>
%133 = vector.insert %132, %cst_1 [0] : i32 into vector<4xi32>
%134 = vector.extract %131[1, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [1] : i32 into vector<4xi32>
%136 = vector.extract %131[2, 0] : vector<4x1xi32>
%137 = vector.insert %136, %135 [2] : i32 into vector<4xi32>
%138 = vector.extract %131[3, 0] : vector<4x1xi32>
%139 = vector.insert %138, %137 [3] : i32 into vector<4xi32>
%140 = vector.extract %arg9[0] : vector<4xi32>
%141 = arith.muli %36, %139 : vector<4xi32>
%142 = vector.reduction <add>, %141, %140 : vector<4xi32> into i32
%143 = vector.insert %142, %cst_0 [0] : i32 into vector<1xi32>
%144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%145 = vector.insert %144, %cst [0] : vector<1xi8> into vector<4x1xi8>
%146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%147 = vector.insert %146, %145 [1] : vector<1xi8> into vector<4x1xi8>
%148 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%149 = vector.insert %148, %147 [2] : vector<1xi8> into vector<4x1xi8>
%150 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%151 = vector.insert %150, %149 [3] : vector<1xi8> into vector<4x1xi8>
%152 = arith.extsi %151 : vector<4x1xi8> to vector<4x1xi32>
%153 = vector.extract %152[0, 0] : vector<4x1xi32>
%154 = vector.insert %153, %cst_1 [0] : i32 into vector<4xi32>
%155 = vector.extract %152[1, 0] : vector<4x1xi32>
%156 = vector.insert %155, %154 [1] : i32 into vector<4xi32>
%157 = vector.extract %152[2, 0] : vector<4x1xi32>
%158 = vector.insert %157, %156 [2] : i32 into vector<4xi32>
%159 = vector.extract %152[3, 0] : vector<4x1xi32>
%160 = vector.insert %159, %158 [3] : i32 into vector<4xi32>
%161 = vector.extract %arg9[1] : vector<4xi32>
%162 = arith.muli %36, %160 : vector<4xi32>
%163 = vector.reduction <add>, %162, %161 : vector<4xi32> into i32
%164 = vector.insert %163, %cst_0 [0] : i32 into vector<1xi32>
%165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%166 = vector.insert %165, %cst [0] : vector<1xi8> into vector<4x1xi8>
%167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%168 = vector.insert %167, %166 [1] : vector<1xi8> into vector<4x1xi8>
%169 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%170 = vector.insert %169, %168 [2] : vector<1xi8> into vector<4x1xi8>
%171 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%172 = vector.insert %171, %170 [3] : vector<1xi8> into vector<4x1xi8>
%173 = arith.extsi %172 : vector<4x1xi8> to vector<4x1xi32>
%174 = vector.extract %173[0, 0] : vector<4x1xi32>
%175 = vector.insert %174, %cst_1 [0] : i32 into vector<4xi32>
%176 = vector.extract %173[1, 0] : vector<4x1xi32>
%177 = vector.insert %176, %175 [1] : i32 into vector<4xi32>
%178 = vector.extract %173[2, 0] : vector<4x1xi32>
%179 = vector.insert %178, %177 [2] : i32 into vector<4xi32>
%180 = vector.extract %173[3, 0] : vector<4x1xi32>
%181 = vector.insert %180, %179 [3] : i32 into vector<4xi32>
%182 = vector.extract %arg9[2] : vector<4xi32>
%183 = arith.muli %36, %181 : vector<4xi32>
%184 = vector.reduction <add>, %183, %182 : vector<4xi32> into i32
%185 = vector.insert %184, %cst_0 [0] : i32 into vector<1xi32>
%186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%187 = vector.insert %186, %cst [0] : vector<1xi8> into vector<4x1xi8>
%188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%189 = vector.insert %188, %187 [1] : vector<1xi8> into vector<4x1xi8>
%190 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%191 = vector.insert %190, %189 [2] : vector<1xi8> into vector<4x1xi8>
%192 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%193 = vector.insert %192, %191 [3] : vector<1xi8> into vector<4x1xi8>
%194 = arith.extsi %193 : vector<4x1xi8> to vector<4x1xi32>
%195 = vector.extract %194[0, 0] : vector<4x1xi32>
%196 = vector.insert %195, %cst_1 [0] : i32 into vector<4xi32>
%197 = vector.extract %194[1, 0] : vector<4x1xi32>
%198 = vector.insert %197, %196 [1] : i32 into vector<4xi32>
%199 = vector.extract %194[2, 0] : vector<4x1xi32>
%200 = vector.insert %199, %198 [2] : i32 into vector<4xi32>
%201 = vector.extract %194[3, 0] : vector<4x1xi32>
%202 = vector.insert %201, %200 [3] : i32 into vector<4xi32>
%203 = vector.extract %arg9[3] : vector<4xi32>
%204 = arith.muli %36, %202 : vector<4xi32>
%205 = vector.reduction <add>, %204, %203 : vector<4xi32> into i32
%206 = vector.insert %205, %cst_0 [0] : i32 into vector<1xi32>
%207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%208 = vector.insert %207, %cst [0] : vector<1xi8> into vector<4x1xi8>
%209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%210 = vector.insert %209, %208 [1] : vector<1xi8> into vector<4x1xi8>
%211 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%212 = vector.insert %211, %210 [2] : vector<1xi8> into vector<4x1xi8>
%213 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%214 = vector.insert %213, %212 [3] : vector<1xi8> into vector<4x1xi8>
%215 = arith.extsi %214 : vector<4x1xi8> to vector<4x1xi32>
%216 = vector.extract %215[0, 0] : vector<4x1xi32>
%217 = vector.insert %216, %cst_1 [0] : i32 into vector<4xi32>
%218 = vector.extract %215[1, 0] : vector<4x1xi32>
%219 = vector.insert %218, %217 [1] : i32 into vector<4xi32>
%220 = vector.extract %215[2, 0] : vector<4x1xi32>
%221 = vector.insert %220, %219 [2] : i32 into vector<4xi32>
%222 = vector.extract %215[3, 0] : vector<4x1xi32>
%223 = vector.insert %222, %221 [3] : i32 into vector<4xi32>
%224 = vector.extract %arg8[0] : vector<4xi32>
%225 = arith.muli %37, %223 : vector<4xi32>
%226 = vector.reduction <add>, %225, %224 : vector<4xi32> into i32
%227 = vector.insert %226, %cst_0 [0] : i32 into vector<1xi32>
%228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%229 = vector.insert %228, %cst [0] : vector<1xi8> into vector<4x1xi8>
%230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%231 = vector.insert %230, %229 [1] : vector<1xi8> into vector<4x1xi8>
%232 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%233 = vector.insert %232, %231 [2] : vector<1xi8> into vector<4x1xi8>
%234 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%235 = vector.insert %234, %233 [3] : vector<1xi8> into vector<4x1xi8>
%236 = arith.extsi %235 : vector<4x1xi8> to vector<4x1xi32>
%237 = vector.extract %236[0, 0] : vector<4x1xi32>
%238 = vector.insert %237, %cst_1 [0] : i32 into vector<4xi32>
%239 = vector.extract %236[1, 0] : vector<4x1xi32>
%240 = vector.insert %239, %238 [1] : i32 into vector<4xi32>
%241 = vector.extract %236[2, 0] : vector<4x1xi32>
%242 = vector.insert %241, %240 [2] : i32 into vector<4xi32>
%243 = vector.extract %236[3, 0] : vector<4x1xi32>
%244 = vector.insert %243, %242 [3] : i32 into vector<4xi32>
%245 = vector.extract %arg8[1] : vector<4xi32>
%246 = arith.muli %37, %244 : vector<4xi32>
%247 = vector.reduction <add>, %246, %245 : vector<4xi32> into i32
%248 = vector.insert %247, %cst_0 [0] : i32 into vector<1xi32>
%249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%250 = vector.insert %249, %cst [0] : vector<1xi8> into vector<4x1xi8>
%251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%252 = vector.insert %251, %250 [1] : vector<1xi8> into vector<4x1xi8>
%253 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%254 = vector.insert %253, %252 [2] : vector<1xi8> into vector<4x1xi8>
%255 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%256 = vector.insert %255, %254 [3] : vector<1xi8> into vector<4x1xi8>
%257 = arith.extsi %256 : vector<4x1xi8> to vector<4x1xi32>
%258 = vector.extract %257[0, 0] : vector<4x1xi32>
%259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32>
%260 = vector.extract %257[1, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [1] : i32 into vector<4xi32>
%262 = vector.extract %257[2, 0] : vector<4x1xi32>
%263 = vector.insert %262, %261 [2] : i32 into vector<4xi32>
%264 = vector.extract %257[3, 0] : vector<4x1xi32>
%265 = vector.insert %264, %263 [3] : i32 into vector<4xi32>
%266 = vector.extract %arg8[2] : vector<4xi32>
%267 = arith.muli %37, %265 : vector<4xi32>
%268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32
%269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32>
%270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%271 = vector.insert %270, %cst [0] : vector<1xi8> into vector<4x1xi8>
%272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%273 = vector.insert %272, %271 [1] : vector<1xi8> into vector<4x1xi8>
%274 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%275 = vector.insert %274, %273 [2] : vector<1xi8> into vector<4x1xi8>
%276 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%277 = vector.insert %276, %275 [3] : vector<1xi8> into vector<4x1xi8>
%278 = arith.extsi %277 : vector<4x1xi8> to vector<4x1xi32>
%279 = vector.extract %278[0, 0] : vector<4x1xi32>
%280 = vector.insert %279, %cst_1 [0] : i32 into vector<4xi32>
%281 = vector.extract %278[1, 0] : vector<4x1xi32>
%282 = vector.insert %281, %280 [1] : i32 into vector<4xi32>
%283 = vector.extract %278[2, 0] : vector<4x1xi32>
%284 = vector.insert %283, %282 [2] : i32 into vector<4xi32>
%285 = vector.extract %278[3, 0] : vector<4x1xi32>
%286 = vector.insert %285, %284 [3] : i32 into vector<4xi32>
%287 = vector.extract %arg8[3] : vector<4xi32>
%288 = arith.muli %37, %286 : vector<4xi32>
%289 = vector.reduction <add>, %288, %287 : vector<4xi32> into i32
%290 = vector.insert %289, %cst_0 [0] : i32 into vector<1xi32>
%291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%292 = vector.insert %291, %cst [0] : vector<1xi8> into vector<4x1xi8>
%293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%294 = vector.insert %293, %292 [1] : vector<1xi8> into vector<4x1xi8>
%295 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%296 = vector.insert %295, %294 [2] : vector<1xi8> into vector<4x1xi8>
%297 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%298 = vector.insert %297, %296 [3] : vector<1xi8> into vector<4x1xi8>
%299 = arith.extsi %298 : vector<4x1xi8> to vector<4x1xi32>
%300 = vector.extract %299[0, 0] : vector<4x1xi32>
%301 = vector.insert %300, %cst_1 [0] : i32 into vector<4xi32>
%302 = vector.extract %299[1, 0] : vector<4x1xi32>
%303 = vector.insert %302, %301 [1] : i32 into vector<4xi32>
%304 = vector.extract %299[2, 0] : vector<4x1xi32>
%305 = vector.insert %304, %303 [2] : i32 into vector<4xi32>
%306 = vector.extract %299[3, 0] : vector<4x1xi32>
%307 = vector.insert %306, %305 [3] : i32 into vector<4xi32>
%308 = vector.extract %arg7[0] : vector<4xi32>
%309 = arith.muli %38, %307 : vector<4xi32>
%310 = vector.reduction <add>, %309, %308 : vector<4xi32> into i32
%311 = vector.insert %310, %cst_0 [0] : i32 into vector<1xi32>
%312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%313 = vector.insert %312, %cst [0] : vector<1xi8> into vector<4x1xi8>
%314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%315 = vector.insert %314, %313 [1] : vector<1xi8> into vector<4x1xi8>
%316 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%317 = vector.insert %316, %315 [2] : vector<1xi8> into vector<4x1xi8>
%318 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%319 = vector.insert %318, %317 [3] : vector<1xi8> into vector<4x1xi8>
%320 = arith.extsi %319 : vector<4x1xi8> to vector<4x1xi32>
%321 = vector.extract %320[0, 0] : vector<4x1xi32>
%322 = vector.insert %321, %cst_1 [0] : i32 into vector<4xi32>
%323 = vector.extract %320[1, 0] : vector<4x1xi32>
%324 = vector.insert %323, %322 [1] : i32 into vector<4xi32>
%325 = vector.extract %320[2, 0] : vector<4x1xi32>
%326 = vector.insert %325, %324 [2] : i32 into vector<4xi32>
%327 = vector.extract %320[3, 0] : vector<4x1xi32>
%328 = vector.insert %327, %326 [3] : i32 into vector<4xi32>
%329 = vector.extract %arg7[1] : vector<4xi32>
%330 = arith.muli %38, %328 : vector<4xi32>
%331 = vector.reduction <add>, %330, %329 : vector<4xi32> into i32
%332 = vector.insert %331, %cst_0 [0] : i32 into vector<1xi32>
%333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%334 = vector.insert %333, %cst [0] : vector<1xi8> into vector<4x1xi8>
%335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%336 = vector.insert %335, %334 [1] : vector<1xi8> into vector<4x1xi8>
%337 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%338 = vector.insert %337, %336 [2] : vector<1xi8> into vector<4x1xi8>
%339 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%340 = vector.insert %339, %338 [3] : vector<1xi8> into vector<4x1xi8>
%341 = arith.extsi %340 : vector<4x1xi8> to vector<4x1xi32>
%342 = vector.extract %341[0, 0] : vector<4x1xi32>
%343 = vector.insert %342, %cst_1 [0] : i32 into vector<4xi32>
%344 = vector.extract %341[1, 0] : vector<4x1xi32>
%345 = vector.insert %344, %343 [1] : i32 into vector<4xi32>
%346 = vector.extract %341[2, 0] : vector<4x1xi32>
%347 = vector.insert %346, %345 [2] : i32 into vector<4xi32>
%348 = vector.extract %341[3, 0] : vector<4x1xi32>
%349 = vector.insert %348, %347 [3] : i32 into vector<4xi32>
%350 = vector.extract %arg7[2] : vector<4xi32>
%351 = arith.muli %38, %349 : vector<4xi32>
%352 = vector.reduction <add>, %351, %350 : vector<4xi32> into i32
%353 = vector.insert %352, %cst_0 [0] : i32 into vector<1xi32>
%354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%355 = vector.insert %354, %cst [0] : vector<1xi8> into vector<4x1xi8>
%356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%357 = vector.insert %356, %355 [1] : vector<1xi8> into vector<4x1xi8>
%358 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%359 = vector.insert %358, %357 [2] : vector<1xi8> into vector<4x1xi8>
%360 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%361 = vector.insert %360, %359 [3] : vector<1xi8> into vector<4x1xi8>
%362 = arith.extsi %361 : vector<4x1xi8> to vector<4x1xi32>
%363 = vector.extract %362[0, 0] : vector<4x1xi32>
%364 = vector.insert %363, %cst_1 [0] : i32 into vector<4xi32>
%365 = vector.extract %362[1, 0] : vector<4x1xi32>
%366 = vector.insert %365, %364 [1] : i32 into vector<4xi32>
%367 = vector.extract %362[2, 0] : vector<4x1xi32>
%368 = vector.insert %367, %366 [2] : i32 into vector<4xi32>
%369 = vector.extract %362[3, 0] : vector<4x1xi32>
%370 = vector.insert %369, %368 [3] : i32 into vector<4xi32>
%371 = vector.extract %arg7[3] : vector<4xi32>
%372 = arith.muli %38, %370 : vector<4xi32>
%373 = vector.reduction <add>, %372, %371 : vector<4xi32> into i32
%374 = vector.insert %373, %cst_0 [0] : i32 into vector<1xi32>
%375 = vector.insert_strided_slice %59, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%376 = vector.insert_strided_slice %80, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%377 = vector.insert_strided_slice %101, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%378 = vector.insert_strided_slice %122, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%379 = vector.insert_strided_slice %143, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%380 = vector.insert_strided_slice %164, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%381 = vector.insert_strided_slice %185, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%382 = vector.insert_strided_slice %206, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%383 = vector.insert_strided_slice %227, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%384 = vector.insert_strided_slice %248, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%385 = vector.insert_strided_slice %269, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%386 = vector.insert_strided_slice %290, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%387 = vector.insert_strided_slice %311, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%388 = vector.insert_strided_slice %332, %387 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%389 = vector.insert_strided_slice %353, %388 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%390 = vector.insert_strided_slice %374, %389 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %390, %386, %382, %378 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
%20 = vector.transfer_write %19#3, %18[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%21 = vector.transfer_write %19#2, %20[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%22 = vector.transfer_write %19#1, %21[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%23 = vector.transfer_write %19#0, %22[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %23 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
--- After lowering transfer ops ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15 = vector.transfer_write %cst_1, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%16 = vector.transfer_write %cst_1, %15[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %cst_1, %16[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %cst_1, %17[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%24 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%25 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%26 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%27 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%35 = arith.extsi %24 : vector<4xi8> to vector<4xi32>
%36 = arith.extsi %25 : vector<4xi8> to vector<4xi32>
%37 = arith.extsi %26 : vector<4xi8> to vector<4xi32>
%38 = arith.extsi %27 : vector<4xi8> to vector<4xi32>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %cst [0] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [1] : vector<1xi8> into vector<4x1xi8>
%43 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%44 = vector.insert %43, %42 [2] : vector<1xi8> into vector<4x1xi8>
%45 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%46 = vector.insert %45, %44 [3] : vector<1xi8> into vector<4x1xi8>
%47 = arith.extsi %46 : vector<4x1xi8> to vector<4x1xi32>
%48 = vector.extract %47[0, 0] : vector<4x1xi32>
%49 = vector.insert %48, %cst_1 [0] : i32 into vector<4xi32>
%50 = vector.extract %47[1, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [1] : i32 into vector<4xi32>
%52 = vector.extract %47[2, 0] : vector<4x1xi32>
%53 = vector.insert %52, %51 [2] : i32 into vector<4xi32>
%54 = vector.extract %47[3, 0] : vector<4x1xi32>
%55 = vector.insert %54, %53 [3] : i32 into vector<4xi32>
%56 = vector.extract %arg10[0] : vector<4xi32>
%57 = arith.muli %35, %55 : vector<4xi32>
%58 = vector.reduction <add>, %57, %56 : vector<4xi32> into i32
%59 = vector.insert %58, %cst_0 [0] : i32 into vector<1xi32>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %cst [0] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [1] : vector<1xi8> into vector<4x1xi8>
%64 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%65 = vector.insert %64, %63 [2] : vector<1xi8> into vector<4x1xi8>
%66 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%67 = vector.insert %66, %65 [3] : vector<1xi8> into vector<4x1xi8>
%68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32>
%69 = vector.extract %68[0, 0] : vector<4x1xi32>
%70 = vector.insert %69, %cst_1 [0] : i32 into vector<4xi32>
%71 = vector.extract %68[1, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [1] : i32 into vector<4xi32>
%73 = vector.extract %68[2, 0] : vector<4x1xi32>
%74 = vector.insert %73, %72 [2] : i32 into vector<4xi32>
%75 = vector.extract %68[3, 0] : vector<4x1xi32>
%76 = vector.insert %75, %74 [3] : i32 into vector<4xi32>
%77 = vector.extract %arg10[1] : vector<4xi32>
%78 = arith.muli %35, %76 : vector<4xi32>
%79 = vector.reduction <add>, %78, %77 : vector<4xi32> into i32
%80 = vector.insert %79, %cst_0 [0] : i32 into vector<1xi32>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %cst [0] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [1] : vector<1xi8> into vector<4x1xi8>
%85 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%86 = vector.insert %85, %84 [2] : vector<1xi8> into vector<4x1xi8>
%87 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%88 = vector.insert %87, %86 [3] : vector<1xi8> into vector<4x1xi8>
%89 = arith.extsi %88 : vector<4x1xi8> to vector<4x1xi32>
%90 = vector.extract %89[0, 0] : vector<4x1xi32>
%91 = vector.insert %90, %cst_1 [0] : i32 into vector<4xi32>
%92 = vector.extract %89[1, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [1] : i32 into vector<4xi32>
%94 = vector.extract %89[2, 0] : vector<4x1xi32>
%95 = vector.insert %94, %93 [2] : i32 into vector<4xi32>
%96 = vector.extract %89[3, 0] : vector<4x1xi32>
%97 = vector.insert %96, %95 [3] : i32 into vector<4xi32>
%98 = vector.extract %arg10[2] : vector<4xi32>
%99 = arith.muli %35, %97 : vector<4xi32>
%100 = vector.reduction <add>, %99, %98 : vector<4xi32> into i32
%101 = vector.insert %100, %cst_0 [0] : i32 into vector<1xi32>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %cst [0] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [1] : vector<1xi8> into vector<4x1xi8>
%106 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%107 = vector.insert %106, %105 [2] : vector<1xi8> into vector<4x1xi8>
%108 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%109 = vector.insert %108, %107 [3] : vector<1xi8> into vector<4x1xi8>
%110 = arith.extsi %109 : vector<4x1xi8> to vector<4x1xi32>
%111 = vector.extract %110[0, 0] : vector<4x1xi32>
%112 = vector.insert %111, %cst_1 [0] : i32 into vector<4xi32>
%113 = vector.extract %110[1, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [1] : i32 into vector<4xi32>
%115 = vector.extract %110[2, 0] : vector<4x1xi32>
%116 = vector.insert %115, %114 [2] : i32 into vector<4xi32>
%117 = vector.extract %110[3, 0] : vector<4x1xi32>
%118 = vector.insert %117, %116 [3] : i32 into vector<4xi32>
%119 = vector.extract %arg10[3] : vector<4xi32>
%120 = arith.muli %35, %118 : vector<4xi32>
%121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32
%122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32>
%123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%124 = vector.insert %123, %cst [0] : vector<1xi8> into vector<4x1xi8>
%125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%126 = vector.insert %125, %124 [1] : vector<1xi8> into vector<4x1xi8>
%127 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%128 = vector.insert %127, %126 [2] : vector<1xi8> into vector<4x1xi8>
%129 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%130 = vector.insert %129, %128 [3] : vector<1xi8> into vector<4x1xi8>
%131 = arith.extsi %130 : vector<4x1xi8> to vector<4x1xi32>
%132 = vector.extract %131[0, 0] : vector<4x1xi32>
%133 = vector.insert %132, %cst_1 [0] : i32 into vector<4xi32>
%134 = vector.extract %131[1, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [1] : i32 into vector<4xi32>
%136 = vector.extract %131[2, 0] : vector<4x1xi32>
%137 = vector.insert %136, %135 [2] : i32 into vector<4xi32>
%138 = vector.extract %131[3, 0] : vector<4x1xi32>
%139 = vector.insert %138, %137 [3] : i32 into vector<4xi32>
%140 = vector.extract %arg9[0] : vector<4xi32>
%141 = arith.muli %36, %139 : vector<4xi32>
%142 = vector.reduction <add>, %141, %140 : vector<4xi32> into i32
%143 = vector.insert %142, %cst_0 [0] : i32 into vector<1xi32>
%144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%145 = vector.insert %144, %cst [0] : vector<1xi8> into vector<4x1xi8>
%146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%147 = vector.insert %146, %145 [1] : vector<1xi8> into vector<4x1xi8>
%148 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%149 = vector.insert %148, %147 [2] : vector<1xi8> into vector<4x1xi8>
%150 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%151 = vector.insert %150, %149 [3] : vector<1xi8> into vector<4x1xi8>
%152 = arith.extsi %151 : vector<4x1xi8> to vector<4x1xi32>
%153 = vector.extract %152[0, 0] : vector<4x1xi32>
%154 = vector.insert %153, %cst_1 [0] : i32 into vector<4xi32>
%155 = vector.extract %152[1, 0] : vector<4x1xi32>
%156 = vector.insert %155, %154 [1] : i32 into vector<4xi32>
%157 = vector.extract %152[2, 0] : vector<4x1xi32>
%158 = vector.insert %157, %156 [2] : i32 into vector<4xi32>
%159 = vector.extract %152[3, 0] : vector<4x1xi32>
%160 = vector.insert %159, %158 [3] : i32 into vector<4xi32>
%161 = vector.extract %arg9[1] : vector<4xi32>
%162 = arith.muli %36, %160 : vector<4xi32>
%163 = vector.reduction <add>, %162, %161 : vector<4xi32> into i32
%164 = vector.insert %163, %cst_0 [0] : i32 into vector<1xi32>
%165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%166 = vector.insert %165, %cst [0] : vector<1xi8> into vector<4x1xi8>
%167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%168 = vector.insert %167, %166 [1] : vector<1xi8> into vector<4x1xi8>
%169 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%170 = vector.insert %169, %168 [2] : vector<1xi8> into vector<4x1xi8>
%171 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%172 = vector.insert %171, %170 [3] : vector<1xi8> into vector<4x1xi8>
%173 = arith.extsi %172 : vector<4x1xi8> to vector<4x1xi32>
%174 = vector.extract %173[0, 0] : vector<4x1xi32>
%175 = vector.insert %174, %cst_1 [0] : i32 into vector<4xi32>
%176 = vector.extract %173[1, 0] : vector<4x1xi32>
%177 = vector.insert %176, %175 [1] : i32 into vector<4xi32>
%178 = vector.extract %173[2, 0] : vector<4x1xi32>
%179 = vector.insert %178, %177 [2] : i32 into vector<4xi32>
%180 = vector.extract %173[3, 0] : vector<4x1xi32>
%181 = vector.insert %180, %179 [3] : i32 into vector<4xi32>
%182 = vector.extract %arg9[2] : vector<4xi32>
%183 = arith.muli %36, %181 : vector<4xi32>
%184 = vector.reduction <add>, %183, %182 : vector<4xi32> into i32
%185 = vector.insert %184, %cst_0 [0] : i32 into vector<1xi32>
%186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%187 = vector.insert %186, %cst [0] : vector<1xi8> into vector<4x1xi8>
%188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%189 = vector.insert %188, %187 [1] : vector<1xi8> into vector<4x1xi8>
%190 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%191 = vector.insert %190, %189 [2] : vector<1xi8> into vector<4x1xi8>
%192 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%193 = vector.insert %192, %191 [3] : vector<1xi8> into vector<4x1xi8>
%194 = arith.extsi %193 : vector<4x1xi8> to vector<4x1xi32>
%195 = vector.extract %194[0, 0] : vector<4x1xi32>
%196 = vector.insert %195, %cst_1 [0] : i32 into vector<4xi32>
%197 = vector.extract %194[1, 0] : vector<4x1xi32>
%198 = vector.insert %197, %196 [1] : i32 into vector<4xi32>
%199 = vector.extract %194[2, 0] : vector<4x1xi32>
%200 = vector.insert %199, %198 [2] : i32 into vector<4xi32>
%201 = vector.extract %194[3, 0] : vector<4x1xi32>
%202 = vector.insert %201, %200 [3] : i32 into vector<4xi32>
%203 = vector.extract %arg9[3] : vector<4xi32>
%204 = arith.muli %36, %202 : vector<4xi32>
%205 = vector.reduction <add>, %204, %203 : vector<4xi32> into i32
%206 = vector.insert %205, %cst_0 [0] : i32 into vector<1xi32>
%207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%208 = vector.insert %207, %cst [0] : vector<1xi8> into vector<4x1xi8>
%209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%210 = vector.insert %209, %208 [1] : vector<1xi8> into vector<4x1xi8>
%211 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%212 = vector.insert %211, %210 [2] : vector<1xi8> into vector<4x1xi8>
%213 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%214 = vector.insert %213, %212 [3] : vector<1xi8> into vector<4x1xi8>
%215 = arith.extsi %214 : vector<4x1xi8> to vector<4x1xi32>
%216 = vector.extract %215[0, 0] : vector<4x1xi32>
%217 = vector.insert %216, %cst_1 [0] : i32 into vector<4xi32>
%218 = vector.extract %215[1, 0] : vector<4x1xi32>
%219 = vector.insert %218, %217 [1] : i32 into vector<4xi32>
%220 = vector.extract %215[2, 0] : vector<4x1xi32>
%221 = vector.insert %220, %219 [2] : i32 into vector<4xi32>
%222 = vector.extract %215[3, 0] : vector<4x1xi32>
%223 = vector.insert %222, %221 [3] : i32 into vector<4xi32>
%224 = vector.extract %arg8[0] : vector<4xi32>
%225 = arith.muli %37, %223 : vector<4xi32>
%226 = vector.reduction <add>, %225, %224 : vector<4xi32> into i32
%227 = vector.insert %226, %cst_0 [0] : i32 into vector<1xi32>
%228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%229 = vector.insert %228, %cst [0] : vector<1xi8> into vector<4x1xi8>
%230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%231 = vector.insert %230, %229 [1] : vector<1xi8> into vector<4x1xi8>
%232 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%233 = vector.insert %232, %231 [2] : vector<1xi8> into vector<4x1xi8>
%234 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%235 = vector.insert %234, %233 [3] : vector<1xi8> into vector<4x1xi8>
%236 = arith.extsi %235 : vector<4x1xi8> to vector<4x1xi32>
%237 = vector.extract %236[0, 0] : vector<4x1xi32>
%238 = vector.insert %237, %cst_1 [0] : i32 into vector<4xi32>
%239 = vector.extract %236[1, 0] : vector<4x1xi32>
%240 = vector.insert %239, %238 [1] : i32 into vector<4xi32>
%241 = vector.extract %236[2, 0] : vector<4x1xi32>
%242 = vector.insert %241, %240 [2] : i32 into vector<4xi32>
%243 = vector.extract %236[3, 0] : vector<4x1xi32>
%244 = vector.insert %243, %242 [3] : i32 into vector<4xi32>
%245 = vector.extract %arg8[1] : vector<4xi32>
%246 = arith.muli %37, %244 : vector<4xi32>
%247 = vector.reduction <add>, %246, %245 : vector<4xi32> into i32
%248 = vector.insert %247, %cst_0 [0] : i32 into vector<1xi32>
%249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%250 = vector.insert %249, %cst [0] : vector<1xi8> into vector<4x1xi8>
%251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%252 = vector.insert %251, %250 [1] : vector<1xi8> into vector<4x1xi8>
%253 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%254 = vector.insert %253, %252 [2] : vector<1xi8> into vector<4x1xi8>
%255 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%256 = vector.insert %255, %254 [3] : vector<1xi8> into vector<4x1xi8>
%257 = arith.extsi %256 : vector<4x1xi8> to vector<4x1xi32>
%258 = vector.extract %257[0, 0] : vector<4x1xi32>
%259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32>
%260 = vector.extract %257[1, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [1] : i32 into vector<4xi32>
%262 = vector.extract %257[2, 0] : vector<4x1xi32>
%263 = vector.insert %262, %261 [2] : i32 into vector<4xi32>
%264 = vector.extract %257[3, 0] : vector<4x1xi32>
%265 = vector.insert %264, %263 [3] : i32 into vector<4xi32>
%266 = vector.extract %arg8[2] : vector<4xi32>
%267 = arith.muli %37, %265 : vector<4xi32>
%268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32
%269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32>
%270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%271 = vector.insert %270, %cst [0] : vector<1xi8> into vector<4x1xi8>
%272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%273 = vector.insert %272, %271 [1] : vector<1xi8> into vector<4x1xi8>
%274 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%275 = vector.insert %274, %273 [2] : vector<1xi8> into vector<4x1xi8>
%276 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%277 = vector.insert %276, %275 [3] : vector<1xi8> into vector<4x1xi8>
%278 = arith.extsi %277 : vector<4x1xi8> to vector<4x1xi32>
%279 = vector.extract %278[0, 0] : vector<4x1xi32>
%280 = vector.insert %279, %cst_1 [0] : i32 into vector<4xi32>
%281 = vector.extract %278[1, 0] : vector<4x1xi32>
%282 = vector.insert %281, %280 [1] : i32 into vector<4xi32>
%283 = vector.extract %278[2, 0] : vector<4x1xi32>
%284 = vector.insert %283, %282 [2] : i32 into vector<4xi32>
%285 = vector.extract %278[3, 0] : vector<4x1xi32>
%286 = vector.insert %285, %284 [3] : i32 into vector<4xi32>
%287 = vector.extract %arg8[3] : vector<4xi32>
%288 = arith.muli %37, %286 : vector<4xi32>
%289 = vector.reduction <add>, %288, %287 : vector<4xi32> into i32
%290 = vector.insert %289, %cst_0 [0] : i32 into vector<1xi32>
%291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%292 = vector.insert %291, %cst [0] : vector<1xi8> into vector<4x1xi8>
%293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%294 = vector.insert %293, %292 [1] : vector<1xi8> into vector<4x1xi8>
%295 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%296 = vector.insert %295, %294 [2] : vector<1xi8> into vector<4x1xi8>
%297 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%298 = vector.insert %297, %296 [3] : vector<1xi8> into vector<4x1xi8>
%299 = arith.extsi %298 : vector<4x1xi8> to vector<4x1xi32>
%300 = vector.extract %299[0, 0] : vector<4x1xi32>
%301 = vector.insert %300, %cst_1 [0] : i32 into vector<4xi32>
%302 = vector.extract %299[1, 0] : vector<4x1xi32>
%303 = vector.insert %302, %301 [1] : i32 into vector<4xi32>
%304 = vector.extract %299[2, 0] : vector<4x1xi32>
%305 = vector.insert %304, %303 [2] : i32 into vector<4xi32>
%306 = vector.extract %299[3, 0] : vector<4x1xi32>
%307 = vector.insert %306, %305 [3] : i32 into vector<4xi32>
%308 = vector.extract %arg7[0] : vector<4xi32>
%309 = arith.muli %38, %307 : vector<4xi32>
%310 = vector.reduction <add>, %309, %308 : vector<4xi32> into i32
%311 = vector.insert %310, %cst_0 [0] : i32 into vector<1xi32>
%312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%313 = vector.insert %312, %cst [0] : vector<1xi8> into vector<4x1xi8>
%314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%315 = vector.insert %314, %313 [1] : vector<1xi8> into vector<4x1xi8>
%316 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%317 = vector.insert %316, %315 [2] : vector<1xi8> into vector<4x1xi8>
%318 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%319 = vector.insert %318, %317 [3] : vector<1xi8> into vector<4x1xi8>
%320 = arith.extsi %319 : vector<4x1xi8> to vector<4x1xi32>
%321 = vector.extract %320[0, 0] : vector<4x1xi32>
%322 = vector.insert %321, %cst_1 [0] : i32 into vector<4xi32>
%323 = vector.extract %320[1, 0] : vector<4x1xi32>
%324 = vector.insert %323, %322 [1] : i32 into vector<4xi32>
%325 = vector.extract %320[2, 0] : vector<4x1xi32>
%326 = vector.insert %325, %324 [2] : i32 into vector<4xi32>
%327 = vector.extract %320[3, 0] : vector<4x1xi32>
%328 = vector.insert %327, %326 [3] : i32 into vector<4xi32>
%329 = vector.extract %arg7[1] : vector<4xi32>
%330 = arith.muli %38, %328 : vector<4xi32>
%331 = vector.reduction <add>, %330, %329 : vector<4xi32> into i32
%332 = vector.insert %331, %cst_0 [0] : i32 into vector<1xi32>
%333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%334 = vector.insert %333, %cst [0] : vector<1xi8> into vector<4x1xi8>
%335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%336 = vector.insert %335, %334 [1] : vector<1xi8> into vector<4x1xi8>
%337 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%338 = vector.insert %337, %336 [2] : vector<1xi8> into vector<4x1xi8>
%339 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%340 = vector.insert %339, %338 [3] : vector<1xi8> into vector<4x1xi8>
%341 = arith.extsi %340 : vector<4x1xi8> to vector<4x1xi32>
%342 = vector.extract %341[0, 0] : vector<4x1xi32>
%343 = vector.insert %342, %cst_1 [0] : i32 into vector<4xi32>
%344 = vector.extract %341[1, 0] : vector<4x1xi32>
%345 = vector.insert %344, %343 [1] : i32 into vector<4xi32>
%346 = vector.extract %341[2, 0] : vector<4x1xi32>
%347 = vector.insert %346, %345 [2] : i32 into vector<4xi32>
%348 = vector.extract %341[3, 0] : vector<4x1xi32>
%349 = vector.insert %348, %347 [3] : i32 into vector<4xi32>
%350 = vector.extract %arg7[2] : vector<4xi32>
%351 = arith.muli %38, %349 : vector<4xi32>
%352 = vector.reduction <add>, %351, %350 : vector<4xi32> into i32
%353 = vector.insert %352, %cst_0 [0] : i32 into vector<1xi32>
%354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%355 = vector.insert %354, %cst [0] : vector<1xi8> into vector<4x1xi8>
%356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%357 = vector.insert %356, %355 [1] : vector<1xi8> into vector<4x1xi8>
%358 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%359 = vector.insert %358, %357 [2] : vector<1xi8> into vector<4x1xi8>
%360 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%361 = vector.insert %360, %359 [3] : vector<1xi8> into vector<4x1xi8>
%362 = arith.extsi %361 : vector<4x1xi8> to vector<4x1xi32>
%363 = vector.extract %362[0, 0] : vector<4x1xi32>
%364 = vector.insert %363, %cst_1 [0] : i32 into vector<4xi32>
%365 = vector.extract %362[1, 0] : vector<4x1xi32>
%366 = vector.insert %365, %364 [1] : i32 into vector<4xi32>
%367 = vector.extract %362[2, 0] : vector<4x1xi32>
%368 = vector.insert %367, %366 [2] : i32 into vector<4xi32>
%369 = vector.extract %362[3, 0] : vector<4x1xi32>
%370 = vector.insert %369, %368 [3] : i32 into vector<4xi32>
%371 = vector.extract %arg7[3] : vector<4xi32>
%372 = arith.muli %38, %370 : vector<4xi32>
%373 = vector.reduction <add>, %372, %371 : vector<4xi32> into i32
%374 = vector.insert %373, %cst_0 [0] : i32 into vector<1xi32>
%375 = vector.insert_strided_slice %59, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%376 = vector.insert_strided_slice %80, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%377 = vector.insert_strided_slice %101, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%378 = vector.insert_strided_slice %122, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%379 = vector.insert_strided_slice %143, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%380 = vector.insert_strided_slice %164, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%381 = vector.insert_strided_slice %185, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%382 = vector.insert_strided_slice %206, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%383 = vector.insert_strided_slice %227, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%384 = vector.insert_strided_slice %248, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%385 = vector.insert_strided_slice %269, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%386 = vector.insert_strided_slice %290, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%387 = vector.insert_strided_slice %311, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%388 = vector.insert_strided_slice %332, %387 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%389 = vector.insert_strided_slice %353, %388 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%390 = vector.insert_strided_slice %374, %389 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %390, %386, %382, %378 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
%20 = vector.transfer_write %19#3, %18[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%21 = vector.transfer_write %19#2, %20[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%22 = vector.transfer_write %19#1, %21[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%23 = vector.transfer_write %19#0, %22[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %23 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
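(A rough scalar model of what the unrolled scf.for body in the dump above computes, added for orientation only; the function name and plain-Python types are illustrative and not part of the compiler output. Each K-step loads four i8x4 row vectors from each operand tile, sign-extends them, and updates a 4x4 i32 accumulator tile with per-element dot products, i.e. acc[i][j] += sum_k A[i][k] * B[k][j].)

def micro_kernel_step(acc, a_block, b_block):
    # acc: 4x4 Python ints, the i32 accumulators carried as %arg7..%arg10
    # a_block: 4 rows of 4 signed i8 values read as %24..%27 (A tile, k..k+3)
    # b_block: 4 rows of 4 signed i8 values read as %28/%30/%32/%34 (B tile, k..k+3)
    for i in range(4):              # which accumulator vector
        for j in range(4):          # which lane of that vector
            # column j of b_block is the vector assembled by the
            # extract_strided_slice / vector.insert sequence; after extsi it
            # feeds arith.muli followed by vector.reduction <add>
            acc[i][j] += sum(a_block[i][k] * b_block[k][j] for k in range(4))
    return acc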
--- After lowering various vector ops ---
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15 = vector.transfer_write %cst_1, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%16 = vector.transfer_write %cst_1, %15[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %cst_1, %16[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %cst_1, %17[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%24 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%25 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%26 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%27 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%28 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%32 = vector.transfer_read %9[%31, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%33 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%34 = vector.transfer_read %9[%33, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%35 = arith.extsi %24 : vector<4xi8> to vector<4xi32>
%36 = arith.extsi %25 : vector<4xi8> to vector<4xi32>
%37 = arith.extsi %26 : vector<4xi8> to vector<4xi32>
%38 = arith.extsi %27 : vector<4xi8> to vector<4xi32>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %cst [0] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [1] : vector<1xi8> into vector<4x1xi8>
%43 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%44 = vector.insert %43, %42 [2] : vector<1xi8> into vector<4x1xi8>
%45 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%46 = vector.insert %45, %44 [3] : vector<1xi8> into vector<4x1xi8>
%47 = arith.extsi %46 : vector<4x1xi8> to vector<4x1xi32>
%48 = vector.extract %47[0, 0] : vector<4x1xi32>
%49 = vector.insert %48, %cst_1 [0] : i32 into vector<4xi32>
%50 = vector.extract %47[1, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [1] : i32 into vector<4xi32>
%52 = vector.extract %47[2, 0] : vector<4x1xi32>
%53 = vector.insert %52, %51 [2] : i32 into vector<4xi32>
%54 = vector.extract %47[3, 0] : vector<4x1xi32>
%55 = vector.insert %54, %53 [3] : i32 into vector<4xi32>
%56 = vector.extract %arg10[0] : vector<4xi32>
%57 = arith.muli %35, %55 : vector<4xi32>
%58 = vector.reduction <add>, %57, %56 : vector<4xi32> into i32
%59 = vector.insert %58, %cst_0 [0] : i32 into vector<1xi32>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %cst [0] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [1] : vector<1xi8> into vector<4x1xi8>
%64 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%65 = vector.insert %64, %63 [2] : vector<1xi8> into vector<4x1xi8>
%66 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%67 = vector.insert %66, %65 [3] : vector<1xi8> into vector<4x1xi8>
%68 = arith.extsi %67 : vector<4x1xi8> to vector<4x1xi32>
%69 = vector.extract %68[0, 0] : vector<4x1xi32>
%70 = vector.insert %69, %cst_1 [0] : i32 into vector<4xi32>
%71 = vector.extract %68[1, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [1] : i32 into vector<4xi32>
%73 = vector.extract %68[2, 0] : vector<4x1xi32>
%74 = vector.insert %73, %72 [2] : i32 into vector<4xi32>
%75 = vector.extract %68[3, 0] : vector<4x1xi32>
%76 = vector.insert %75, %74 [3] : i32 into vector<4xi32>
%77 = vector.extract %arg10[1] : vector<4xi32>
%78 = arith.muli %35, %76 : vector<4xi32>
%79 = vector.reduction <add>, %78, %77 : vector<4xi32> into i32
%80 = vector.insert %79, %cst_0 [0] : i32 into vector<1xi32>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %cst [0] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [1] : vector<1xi8> into vector<4x1xi8>
%85 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%86 = vector.insert %85, %84 [2] : vector<1xi8> into vector<4x1xi8>
%87 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%88 = vector.insert %87, %86 [3] : vector<1xi8> into vector<4x1xi8>
%89 = arith.extsi %88 : vector<4x1xi8> to vector<4x1xi32>
%90 = vector.extract %89[0, 0] : vector<4x1xi32>
%91 = vector.insert %90, %cst_1 [0] : i32 into vector<4xi32>
%92 = vector.extract %89[1, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [1] : i32 into vector<4xi32>
%94 = vector.extract %89[2, 0] : vector<4x1xi32>
%95 = vector.insert %94, %93 [2] : i32 into vector<4xi32>
%96 = vector.extract %89[3, 0] : vector<4x1xi32>
%97 = vector.insert %96, %95 [3] : i32 into vector<4xi32>
%98 = vector.extract %arg10[2] : vector<4xi32>
%99 = arith.muli %35, %97 : vector<4xi32>
%100 = vector.reduction <add>, %99, %98 : vector<4xi32> into i32
%101 = vector.insert %100, %cst_0 [0] : i32 into vector<1xi32>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %cst [0] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [1] : vector<1xi8> into vector<4x1xi8>
%106 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%107 = vector.insert %106, %105 [2] : vector<1xi8> into vector<4x1xi8>
%108 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%109 = vector.insert %108, %107 [3] : vector<1xi8> into vector<4x1xi8>
%110 = arith.extsi %109 : vector<4x1xi8> to vector<4x1xi32>
%111 = vector.extract %110[0, 0] : vector<4x1xi32>
%112 = vector.insert %111, %cst_1 [0] : i32 into vector<4xi32>
%113 = vector.extract %110[1, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [1] : i32 into vector<4xi32>
%115 = vector.extract %110[2, 0] : vector<4x1xi32>
%116 = vector.insert %115, %114 [2] : i32 into vector<4xi32>
%117 = vector.extract %110[3, 0] : vector<4x1xi32>
%118 = vector.insert %117, %116 [3] : i32 into vector<4xi32>
%119 = vector.extract %arg10[3] : vector<4xi32>
%120 = arith.muli %35, %118 : vector<4xi32>
%121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32
%122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32>
%123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%124 = vector.insert %123, %cst [0] : vector<1xi8> into vector<4x1xi8>
%125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%126 = vector.insert %125, %124 [1] : vector<1xi8> into vector<4x1xi8>
%127 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%128 = vector.insert %127, %126 [2] : vector<1xi8> into vector<4x1xi8>
%129 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%130 = vector.insert %129, %128 [3] : vector<1xi8> into vector<4x1xi8>
%131 = arith.extsi %130 : vector<4x1xi8> to vector<4x1xi32>
%132 = vector.extract %131[0, 0] : vector<4x1xi32>
%133 = vector.insert %132, %cst_1 [0] : i32 into vector<4xi32>
%134 = vector.extract %131[1, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [1] : i32 into vector<4xi32>
%136 = vector.extract %131[2, 0] : vector<4x1xi32>
%137 = vector.insert %136, %135 [2] : i32 into vector<4xi32>
%138 = vector.extract %131[3, 0] : vector<4x1xi32>
%139 = vector.insert %138, %137 [3] : i32 into vector<4xi32>
%140 = vector.extract %arg9[0] : vector<4xi32>
%141 = arith.muli %36, %139 : vector<4xi32>
%142 = vector.reduction <add>, %141, %140 : vector<4xi32> into i32
%143 = vector.insert %142, %cst_0 [0] : i32 into vector<1xi32>
%144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%145 = vector.insert %144, %cst [0] : vector<1xi8> into vector<4x1xi8>
%146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%147 = vector.insert %146, %145 [1] : vector<1xi8> into vector<4x1xi8>
%148 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%149 = vector.insert %148, %147 [2] : vector<1xi8> into vector<4x1xi8>
%150 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%151 = vector.insert %150, %149 [3] : vector<1xi8> into vector<4x1xi8>
%152 = arith.extsi %151 : vector<4x1xi8> to vector<4x1xi32>
%153 = vector.extract %152[0, 0] : vector<4x1xi32>
%154 = vector.insert %153, %cst_1 [0] : i32 into vector<4xi32>
%155 = vector.extract %152[1, 0] : vector<4x1xi32>
%156 = vector.insert %155, %154 [1] : i32 into vector<4xi32>
%157 = vector.extract %152[2, 0] : vector<4x1xi32>
%158 = vector.insert %157, %156 [2] : i32 into vector<4xi32>
%159 = vector.extract %152[3, 0] : vector<4x1xi32>
%160 = vector.insert %159, %158 [3] : i32 into vector<4xi32>
%161 = vector.extract %arg9[1] : vector<4xi32>
%162 = arith.muli %36, %160 : vector<4xi32>
%163 = vector.reduction <add>, %162, %161 : vector<4xi32> into i32
%164 = vector.insert %163, %cst_0 [0] : i32 into vector<1xi32>
%165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%166 = vector.insert %165, %cst [0] : vector<1xi8> into vector<4x1xi8>
%167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%168 = vector.insert %167, %166 [1] : vector<1xi8> into vector<4x1xi8>
%169 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%170 = vector.insert %169, %168 [2] : vector<1xi8> into vector<4x1xi8>
%171 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%172 = vector.insert %171, %170 [3] : vector<1xi8> into vector<4x1xi8>
%173 = arith.extsi %172 : vector<4x1xi8> to vector<4x1xi32>
%174 = vector.extract %173[0, 0] : vector<4x1xi32>
%175 = vector.insert %174, %cst_1 [0] : i32 into vector<4xi32>
%176 = vector.extract %173[1, 0] : vector<4x1xi32>
%177 = vector.insert %176, %175 [1] : i32 into vector<4xi32>
%178 = vector.extract %173[2, 0] : vector<4x1xi32>
%179 = vector.insert %178, %177 [2] : i32 into vector<4xi32>
%180 = vector.extract %173[3, 0] : vector<4x1xi32>
%181 = vector.insert %180, %179 [3] : i32 into vector<4xi32>
%182 = vector.extract %arg9[2] : vector<4xi32>
%183 = arith.muli %36, %181 : vector<4xi32>
%184 = vector.reduction <add>, %183, %182 : vector<4xi32> into i32
%185 = vector.insert %184, %cst_0 [0] : i32 into vector<1xi32>
%186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%187 = vector.insert %186, %cst [0] : vector<1xi8> into vector<4x1xi8>
%188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%189 = vector.insert %188, %187 [1] : vector<1xi8> into vector<4x1xi8>
%190 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%191 = vector.insert %190, %189 [2] : vector<1xi8> into vector<4x1xi8>
%192 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%193 = vector.insert %192, %191 [3] : vector<1xi8> into vector<4x1xi8>
%194 = arith.extsi %193 : vector<4x1xi8> to vector<4x1xi32>
%195 = vector.extract %194[0, 0] : vector<4x1xi32>
%196 = vector.insert %195, %cst_1 [0] : i32 into vector<4xi32>
%197 = vector.extract %194[1, 0] : vector<4x1xi32>
%198 = vector.insert %197, %196 [1] : i32 into vector<4xi32>
%199 = vector.extract %194[2, 0] : vector<4x1xi32>
%200 = vector.insert %199, %198 [2] : i32 into vector<4xi32>
%201 = vector.extract %194[3, 0] : vector<4x1xi32>
%202 = vector.insert %201, %200 [3] : i32 into vector<4xi32>
%203 = vector.extract %arg9[3] : vector<4xi32>
%204 = arith.muli %36, %202 : vector<4xi32>
%205 = vector.reduction <add>, %204, %203 : vector<4xi32> into i32
%206 = vector.insert %205, %cst_0 [0] : i32 into vector<1xi32>
%207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%208 = vector.insert %207, %cst [0] : vector<1xi8> into vector<4x1xi8>
%209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%210 = vector.insert %209, %208 [1] : vector<1xi8> into vector<4x1xi8>
%211 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%212 = vector.insert %211, %210 [2] : vector<1xi8> into vector<4x1xi8>
%213 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%214 = vector.insert %213, %212 [3] : vector<1xi8> into vector<4x1xi8>
%215 = arith.extsi %214 : vector<4x1xi8> to vector<4x1xi32>
%216 = vector.extract %215[0, 0] : vector<4x1xi32>
%217 = vector.insert %216, %cst_1 [0] : i32 into vector<4xi32>
%218 = vector.extract %215[1, 0] : vector<4x1xi32>
%219 = vector.insert %218, %217 [1] : i32 into vector<4xi32>
%220 = vector.extract %215[2, 0] : vector<4x1xi32>
%221 = vector.insert %220, %219 [2] : i32 into vector<4xi32>
%222 = vector.extract %215[3, 0] : vector<4x1xi32>
%223 = vector.insert %222, %221 [3] : i32 into vector<4xi32>
%224 = vector.extract %arg8[0] : vector<4xi32>
%225 = arith.muli %37, %223 : vector<4xi32>
%226 = vector.reduction <add>, %225, %224 : vector<4xi32> into i32
%227 = vector.insert %226, %cst_0 [0] : i32 into vector<1xi32>
%228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%229 = vector.insert %228, %cst [0] : vector<1xi8> into vector<4x1xi8>
%230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%231 = vector.insert %230, %229 [1] : vector<1xi8> into vector<4x1xi8>
%232 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%233 = vector.insert %232, %231 [2] : vector<1xi8> into vector<4x1xi8>
%234 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%235 = vector.insert %234, %233 [3] : vector<1xi8> into vector<4x1xi8>
%236 = arith.extsi %235 : vector<4x1xi8> to vector<4x1xi32>
%237 = vector.extract %236[0, 0] : vector<4x1xi32>
%238 = vector.insert %237, %cst_1 [0] : i32 into vector<4xi32>
%239 = vector.extract %236[1, 0] : vector<4x1xi32>
%240 = vector.insert %239, %238 [1] : i32 into vector<4xi32>
%241 = vector.extract %236[2, 0] : vector<4x1xi32>
%242 = vector.insert %241, %240 [2] : i32 into vector<4xi32>
%243 = vector.extract %236[3, 0] : vector<4x1xi32>
%244 = vector.insert %243, %242 [3] : i32 into vector<4xi32>
%245 = vector.extract %arg8[1] : vector<4xi32>
%246 = arith.muli %37, %244 : vector<4xi32>
%247 = vector.reduction <add>, %246, %245 : vector<4xi32> into i32
%248 = vector.insert %247, %cst_0 [0] : i32 into vector<1xi32>
%249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%250 = vector.insert %249, %cst [0] : vector<1xi8> into vector<4x1xi8>
%251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%252 = vector.insert %251, %250 [1] : vector<1xi8> into vector<4x1xi8>
%253 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%254 = vector.insert %253, %252 [2] : vector<1xi8> into vector<4x1xi8>
%255 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%256 = vector.insert %255, %254 [3] : vector<1xi8> into vector<4x1xi8>
%257 = arith.extsi %256 : vector<4x1xi8> to vector<4x1xi32>
%258 = vector.extract %257[0, 0] : vector<4x1xi32>
%259 = vector.insert %258, %cst_1 [0] : i32 into vector<4xi32>
%260 = vector.extract %257[1, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [1] : i32 into vector<4xi32>
%262 = vector.extract %257[2, 0] : vector<4x1xi32>
%263 = vector.insert %262, %261 [2] : i32 into vector<4xi32>
%264 = vector.extract %257[3, 0] : vector<4x1xi32>
%265 = vector.insert %264, %263 [3] : i32 into vector<4xi32>
%266 = vector.extract %arg8[2] : vector<4xi32>
%267 = arith.muli %37, %265 : vector<4xi32>
%268 = vector.reduction <add>, %267, %266 : vector<4xi32> into i32
%269 = vector.insert %268, %cst_0 [0] : i32 into vector<1xi32>
%270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%271 = vector.insert %270, %cst [0] : vector<1xi8> into vector<4x1xi8>
%272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%273 = vector.insert %272, %271 [1] : vector<1xi8> into vector<4x1xi8>
%274 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%275 = vector.insert %274, %273 [2] : vector<1xi8> into vector<4x1xi8>
%276 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%277 = vector.insert %276, %275 [3] : vector<1xi8> into vector<4x1xi8>
%278 = arith.extsi %277 : vector<4x1xi8> to vector<4x1xi32>
%279 = vector.extract %278[0, 0] : vector<4x1xi32>
%280 = vector.insert %279, %cst_1 [0] : i32 into vector<4xi32>
%281 = vector.extract %278[1, 0] : vector<4x1xi32>
%282 = vector.insert %281, %280 [1] : i32 into vector<4xi32>
%283 = vector.extract %278[2, 0] : vector<4x1xi32>
%284 = vector.insert %283, %282 [2] : i32 into vector<4xi32>
%285 = vector.extract %278[3, 0] : vector<4x1xi32>
%286 = vector.insert %285, %284 [3] : i32 into vector<4xi32>
%287 = vector.extract %arg8[3] : vector<4xi32>
%288 = arith.muli %37, %286 : vector<4xi32>
%289 = vector.reduction <add>, %288, %287 : vector<4xi32> into i32
%290 = vector.insert %289, %cst_0 [0] : i32 into vector<1xi32>
%291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%292 = vector.insert %291, %cst [0] : vector<1xi8> into vector<4x1xi8>
%293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%294 = vector.insert %293, %292 [1] : vector<1xi8> into vector<4x1xi8>
%295 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%296 = vector.insert %295, %294 [2] : vector<1xi8> into vector<4x1xi8>
%297 = vector.extract_strided_slice %34 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%298 = vector.insert %297, %296 [3] : vector<1xi8> into vector<4x1xi8>
%299 = arith.extsi %298 : vector<4x1xi8> to vector<4x1xi32>
%300 = vector.extract %299[0, 0] : vector<4x1xi32>
%301 = vector.insert %300, %cst_1 [0] : i32 into vector<4xi32>
%302 = vector.extract %299[1, 0] : vector<4x1xi32>
%303 = vector.insert %302, %301 [1] : i32 into vector<4xi32>
%304 = vector.extract %299[2, 0] : vector<4x1xi32>
%305 = vector.insert %304, %303 [2] : i32 into vector<4xi32>
%306 = vector.extract %299[3, 0] : vector<4x1xi32>
%307 = vector.insert %306, %305 [3] : i32 into vector<4xi32>
%308 = vector.extract %arg7[0] : vector<4xi32>
%309 = arith.muli %38, %307 : vector<4xi32>
%310 = vector.reduction <add>, %309, %308 : vector<4xi32> into i32
%311 = vector.insert %310, %cst_0 [0] : i32 into vector<1xi32>
%312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%313 = vector.insert %312, %cst [0] : vector<1xi8> into vector<4x1xi8>
%314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%315 = vector.insert %314, %313 [1] : vector<1xi8> into vector<4x1xi8>
%316 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%317 = vector.insert %316, %315 [2] : vector<1xi8> into vector<4x1xi8>
%318 = vector.extract_strided_slice %34 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%319 = vector.insert %318, %317 [3] : vector<1xi8> into vector<4x1xi8>
%320 = arith.extsi %319 : vector<4x1xi8> to vector<4x1xi32>
%321 = vector.extract %320[0, 0] : vector<4x1xi32>
%322 = vector.insert %321, %cst_1 [0] : i32 into vector<4xi32>
%323 = vector.extract %320[1, 0] : vector<4x1xi32>
%324 = vector.insert %323, %322 [1] : i32 into vector<4xi32>
%325 = vector.extract %320[2, 0] : vector<4x1xi32>
%326 = vector.insert %325, %324 [2] : i32 into vector<4xi32>
%327 = vector.extract %320[3, 0] : vector<4x1xi32>
%328 = vector.insert %327, %326 [3] : i32 into vector<4xi32>
%329 = vector.extract %arg7[1] : vector<4xi32>
%330 = arith.muli %38, %328 : vector<4xi32>
%331 = vector.reduction <add>, %330, %329 : vector<4xi32> into i32
%332 = vector.insert %331, %cst_0 [0] : i32 into vector<1xi32>
%333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%334 = vector.insert %333, %cst [0] : vector<1xi8> into vector<4x1xi8>
%335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%336 = vector.insert %335, %334 [1] : vector<1xi8> into vector<4x1xi8>
%337 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%338 = vector.insert %337, %336 [2] : vector<1xi8> into vector<4x1xi8>
%339 = vector.extract_strided_slice %34 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%340 = vector.insert %339, %338 [3] : vector<1xi8> into vector<4x1xi8>
%341 = arith.extsi %340 : vector<4x1xi8> to vector<4x1xi32>
%342 = vector.extract %341[0, 0] : vector<4x1xi32>
%343 = vector.insert %342, %cst_1 [0] : i32 into vector<4xi32>
%344 = vector.extract %341[1, 0] : vector<4x1xi32>
%345 = vector.insert %344, %343 [1] : i32 into vector<4xi32>
%346 = vector.extract %341[2, 0] : vector<4x1xi32>
%347 = vector.insert %346, %345 [2] : i32 into vector<4xi32>
%348 = vector.extract %341[3, 0] : vector<4x1xi32>
%349 = vector.insert %348, %347 [3] : i32 into vector<4xi32>
%350 = vector.extract %arg7[2] : vector<4xi32>
%351 = arith.muli %38, %349 : vector<4xi32>
%352 = vector.reduction <add>, %351, %350 : vector<4xi32> into i32
%353 = vector.insert %352, %cst_0 [0] : i32 into vector<1xi32>
%354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%355 = vector.insert %354, %cst [0] : vector<1xi8> into vector<4x1xi8>
%356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%357 = vector.insert %356, %355 [1] : vector<1xi8> into vector<4x1xi8>
%358 = vector.extract_strided_slice %32 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%359 = vector.insert %358, %357 [2] : vector<1xi8> into vector<4x1xi8>
%360 = vector.extract_strided_slice %34 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%361 = vector.insert %360, %359 [3] : vector<1xi8> into vector<4x1xi8>
%362 = arith.extsi %361 : vector<4x1xi8> to vector<4x1xi32>
%363 = vector.extract %362[0, 0] : vector<4x1xi32>
%364 = vector.insert %363, %cst_1 [0] : i32 into vector<4xi32>
%365 = vector.extract %362[1, 0] : vector<4x1xi32>
%366 = vector.insert %365, %364 [1] : i32 into vector<4xi32>
%367 = vector.extract %362[2, 0] : vector<4x1xi32>
%368 = vector.insert %367, %366 [2] : i32 into vector<4xi32>
%369 = vector.extract %362[3, 0] : vector<4x1xi32>
%370 = vector.insert %369, %368 [3] : i32 into vector<4xi32>
%371 = vector.extract %arg7[3] : vector<4xi32>
%372 = arith.muli %38, %370 : vector<4xi32>
%373 = vector.reduction <add>, %372, %371 : vector<4xi32> into i32
%374 = vector.insert %373, %cst_0 [0] : i32 into vector<1xi32>
%375 = vector.insert_strided_slice %59, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%376 = vector.insert_strided_slice %80, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%377 = vector.insert_strided_slice %101, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%378 = vector.insert_strided_slice %122, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%379 = vector.insert_strided_slice %143, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%380 = vector.insert_strided_slice %164, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%381 = vector.insert_strided_slice %185, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%382 = vector.insert_strided_slice %206, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%383 = vector.insert_strided_slice %227, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%384 = vector.insert_strided_slice %248, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%385 = vector.insert_strided_slice %269, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%386 = vector.insert_strided_slice %290, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%387 = vector.insert_strided_slice %311, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%388 = vector.insert_strided_slice %332, %387 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%389 = vector.insert_strided_slice %353, %388 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%390 = vector.insert_strided_slice %374, %389 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %390, %386, %382, %378 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
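// The K loop above yields its four accumulators in reverse row order: %19#3 is
// written to row 0 of the 4x4 tile and %19#0 to row 3 before the tile is
// re-inserted into the 8x32 workgroup tile.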
%20 = vector.transfer_write %19#3, %18[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%21 = vector.transfer_write %19#2, %20[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%22 = vector.transfer_write %19#1, %21[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%23 = vector.transfer_write %19#0, %22[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %23 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
// -----// IR Dump After SPIRVVectorize (iree-spirv-vectorize) //----- //
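// After SPIR-V vectorization, the 4x4x4 matmul tile body is fully unrolled
// straight-line vector code: the four LHS rows are read as vector<4xi8> and
// sign-extended to i32; one lane from each of the four RHS row reads is
// gathered into a vector<4x1xi8> (a column of the 4x4 RHS tile) and likewise
// extended; and each of the 16 output elements is computed with an elementwise
// arith.muli followed by vector.reduction <add> into the matching accumulator
// lane.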
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
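// %arg7-%arg10 are the running vector<4xi32> accumulators; each holds one row
// of the 4x4 output tile and is updated one lane at a time via
// vector.reduction <add>.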
%20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = arith.extsi %20 : vector<4xi8> to vector<4xi32>
%32 = arith.extsi %21 : vector<4xi8> to vector<4xi32>
%33 = arith.extsi %22 : vector<4xi8> to vector<4xi32>
%34 = arith.extsi %23 : vector<4xi8> to vector<4xi32>
%35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8>
%37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8>
%43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32>
%44 = vector.extract %43[0, 0] : vector<4x1xi32>
%45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32>
%46 = vector.extract %43[1, 0] : vector<4x1xi32>
%47 = vector.insert %46, %45 [1] : i32 into vector<4xi32>
%48 = vector.extract %43[2, 0] : vector<4x1xi32>
%49 = vector.insert %48, %47 [2] : i32 into vector<4xi32>
%50 = vector.extract %43[3, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [3] : i32 into vector<4xi32>
%52 = vector.extract %arg10[0] : vector<4xi32>
%53 = arith.muli %31, %51 : vector<4xi32>
%54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32
%55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32>
%56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8>
%58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %64[0, 0] : vector<4x1xi32>
%66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32>
%67 = vector.extract %64[1, 0] : vector<4x1xi32>
%68 = vector.insert %67, %66 [1] : i32 into vector<4xi32>
%69 = vector.extract %64[2, 0] : vector<4x1xi32>
%70 = vector.insert %69, %68 [2] : i32 into vector<4xi32>
%71 = vector.extract %64[3, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [3] : i32 into vector<4xi32>
%73 = vector.extract %arg10[1] : vector<4xi32>
%74 = arith.muli %31, %72 : vector<4xi32>
%75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32
%76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32>
%77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8>
%79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8>
%85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32>
%86 = vector.extract %85[0, 0] : vector<4x1xi32>
%87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32>
%88 = vector.extract %85[1, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [1] : i32 into vector<4xi32>
%90 = vector.extract %85[2, 0] : vector<4x1xi32>
%91 = vector.insert %90, %89 [2] : i32 into vector<4xi32>
%92 = vector.extract %85[3, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [3] : i32 into vector<4xi32>
%94 = vector.extract %arg10[2] : vector<4xi32>
%95 = arith.muli %31, %93 : vector<4xi32>
%96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32
%97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32>
%98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8>
%100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8>
%106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32>
%107 = vector.extract %106[0, 0] : vector<4x1xi32>
%108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32>
%109 = vector.extract %106[1, 0] : vector<4x1xi32>
%110 = vector.insert %109, %108 [1] : i32 into vector<4xi32>
%111 = vector.extract %106[2, 0] : vector<4x1xi32>
%112 = vector.insert %111, %110 [2] : i32 into vector<4xi32>
%113 = vector.extract %106[3, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [3] : i32 into vector<4xi32>
%115 = vector.extract %arg10[3] : vector<4xi32>
%116 = arith.muli %31, %114 : vector<4xi32>
%117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32
%118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32>
%119 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%120 = vector.insert %119, %cst [0] : vector<1xi8> into vector<4x1xi8>
%121 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%122 = vector.insert %121, %120 [1] : vector<1xi8> into vector<4x1xi8>
%123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%124 = vector.insert %123, %122 [2] : vector<1xi8> into vector<4x1xi8>
%125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%126 = vector.insert %125, %124 [3] : vector<1xi8> into vector<4x1xi8>
%127 = arith.extsi %126 : vector<4x1xi8> to vector<4x1xi32>
%128 = vector.extract %127[0, 0] : vector<4x1xi32>
%129 = vector.insert %128, %cst_1 [0] : i32 into vector<4xi32>
%130 = vector.extract %127[1, 0] : vector<4x1xi32>
%131 = vector.insert %130, %129 [1] : i32 into vector<4xi32>
%132 = vector.extract %127[2, 0] : vector<4x1xi32>
%133 = vector.insert %132, %131 [2] : i32 into vector<4xi32>
%134 = vector.extract %127[3, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [3] : i32 into vector<4xi32>
%136 = vector.extract %arg9[0] : vector<4xi32>
%137 = arith.muli %32, %135 : vector<4xi32>
%138 = vector.reduction <add>, %137, %136 : vector<4xi32> into i32
%139 = vector.insert %138, %cst_0 [0] : i32 into vector<1xi32>
%140 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%141 = vector.insert %140, %cst [0] : vector<1xi8> into vector<4x1xi8>
%142 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%143 = vector.insert %142, %141 [1] : vector<1xi8> into vector<4x1xi8>
%144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%145 = vector.insert %144, %143 [2] : vector<1xi8> into vector<4x1xi8>
%146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%147 = vector.insert %146, %145 [3] : vector<1xi8> into vector<4x1xi8>
%148 = arith.extsi %147 : vector<4x1xi8> to vector<4x1xi32>
%149 = vector.extract %148[0, 0] : vector<4x1xi32>
%150 = vector.insert %149, %cst_1 [0] : i32 into vector<4xi32>
%151 = vector.extract %148[1, 0] : vector<4x1xi32>
%152 = vector.insert %151, %150 [1] : i32 into vector<4xi32>
%153 = vector.extract %148[2, 0] : vector<4x1xi32>
%154 = vector.insert %153, %152 [2] : i32 into vector<4xi32>
%155 = vector.extract %148[3, 0] : vector<4x1xi32>
%156 = vector.insert %155, %154 [3] : i32 into vector<4xi32>
%157 = vector.extract %arg9[1] : vector<4xi32>
%158 = arith.muli %32, %156 : vector<4xi32>
%159 = vector.reduction <add>, %158, %157 : vector<4xi32> into i32
%160 = vector.insert %159, %cst_0 [0] : i32 into vector<1xi32>
%161 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%162 = vector.insert %161, %cst [0] : vector<1xi8> into vector<4x1xi8>
%163 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%164 = vector.insert %163, %162 [1] : vector<1xi8> into vector<4x1xi8>
%165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%166 = vector.insert %165, %164 [2] : vector<1xi8> into vector<4x1xi8>
%167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%168 = vector.insert %167, %166 [3] : vector<1xi8> into vector<4x1xi8>
%169 = arith.extsi %168 : vector<4x1xi8> to vector<4x1xi32>
%170 = vector.extract %169[0, 0] : vector<4x1xi32>
%171 = vector.insert %170, %cst_1 [0] : i32 into vector<4xi32>
%172 = vector.extract %169[1, 0] : vector<4x1xi32>
%173 = vector.insert %172, %171 [1] : i32 into vector<4xi32>
%174 = vector.extract %169[2, 0] : vector<4x1xi32>
%175 = vector.insert %174, %173 [2] : i32 into vector<4xi32>
%176 = vector.extract %169[3, 0] : vector<4x1xi32>
%177 = vector.insert %176, %175 [3] : i32 into vector<4xi32>
%178 = vector.extract %arg9[2] : vector<4xi32>
%179 = arith.muli %32, %177 : vector<4xi32>
%180 = vector.reduction <add>, %179, %178 : vector<4xi32> into i32
%181 = vector.insert %180, %cst_0 [0] : i32 into vector<1xi32>
%182 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%183 = vector.insert %182, %cst [0] : vector<1xi8> into vector<4x1xi8>
%184 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%185 = vector.insert %184, %183 [1] : vector<1xi8> into vector<4x1xi8>
%186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%187 = vector.insert %186, %185 [2] : vector<1xi8> into vector<4x1xi8>
%188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%189 = vector.insert %188, %187 [3] : vector<1xi8> into vector<4x1xi8>
%190 = arith.extsi %189 : vector<4x1xi8> to vector<4x1xi32>
%191 = vector.extract %190[0, 0] : vector<4x1xi32>
%192 = vector.insert %191, %cst_1 [0] : i32 into vector<4xi32>
%193 = vector.extract %190[1, 0] : vector<4x1xi32>
%194 = vector.insert %193, %192 [1] : i32 into vector<4xi32>
%195 = vector.extract %190[2, 0] : vector<4x1xi32>
%196 = vector.insert %195, %194 [2] : i32 into vector<4xi32>
%197 = vector.extract %190[3, 0] : vector<4x1xi32>
%198 = vector.insert %197, %196 [3] : i32 into vector<4xi32>
%199 = vector.extract %arg9[3] : vector<4xi32>
%200 = arith.muli %32, %198 : vector<4xi32>
%201 = vector.reduction <add>, %200, %199 : vector<4xi32> into i32
%202 = vector.insert %201, %cst_0 [0] : i32 into vector<1xi32>
%203 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%204 = vector.insert %203, %cst [0] : vector<1xi8> into vector<4x1xi8>
%205 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%206 = vector.insert %205, %204 [1] : vector<1xi8> into vector<4x1xi8>
%207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%208 = vector.insert %207, %206 [2] : vector<1xi8> into vector<4x1xi8>
%209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%210 = vector.insert %209, %208 [3] : vector<1xi8> into vector<4x1xi8>
%211 = arith.extsi %210 : vector<4x1xi8> to vector<4x1xi32>
%212 = vector.extract %211[0, 0] : vector<4x1xi32>
%213 = vector.insert %212, %cst_1 [0] : i32 into vector<4xi32>
%214 = vector.extract %211[1, 0] : vector<4x1xi32>
%215 = vector.insert %214, %213 [1] : i32 into vector<4xi32>
%216 = vector.extract %211[2, 0] : vector<4x1xi32>
%217 = vector.insert %216, %215 [2] : i32 into vector<4xi32>
%218 = vector.extract %211[3, 0] : vector<4x1xi32>
%219 = vector.insert %218, %217 [3] : i32 into vector<4xi32>
%220 = vector.extract %arg8[0] : vector<4xi32>
%221 = arith.muli %33, %219 : vector<4xi32>
%222 = vector.reduction <add>, %221, %220 : vector<4xi32> into i32
%223 = vector.insert %222, %cst_0 [0] : i32 into vector<1xi32>
%224 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%225 = vector.insert %224, %cst [0] : vector<1xi8> into vector<4x1xi8>
%226 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%227 = vector.insert %226, %225 [1] : vector<1xi8> into vector<4x1xi8>
%228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%229 = vector.insert %228, %227 [2] : vector<1xi8> into vector<4x1xi8>
%230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%231 = vector.insert %230, %229 [3] : vector<1xi8> into vector<4x1xi8>
%232 = arith.extsi %231 : vector<4x1xi8> to vector<4x1xi32>
%233 = vector.extract %232[0, 0] : vector<4x1xi32>
%234 = vector.insert %233, %cst_1 [0] : i32 into vector<4xi32>
%235 = vector.extract %232[1, 0] : vector<4x1xi32>
%236 = vector.insert %235, %234 [1] : i32 into vector<4xi32>
%237 = vector.extract %232[2, 0] : vector<4x1xi32>
%238 = vector.insert %237, %236 [2] : i32 into vector<4xi32>
%239 = vector.extract %232[3, 0] : vector<4x1xi32>
%240 = vector.insert %239, %238 [3] : i32 into vector<4xi32>
%241 = vector.extract %arg8[1] : vector<4xi32>
%242 = arith.muli %33, %240 : vector<4xi32>
%243 = vector.reduction <add>, %242, %241 : vector<4xi32> into i32
%244 = vector.insert %243, %cst_0 [0] : i32 into vector<1xi32>
%245 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%246 = vector.insert %245, %cst [0] : vector<1xi8> into vector<4x1xi8>
%247 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%248 = vector.insert %247, %246 [1] : vector<1xi8> into vector<4x1xi8>
%249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%250 = vector.insert %249, %248 [2] : vector<1xi8> into vector<4x1xi8>
%251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%252 = vector.insert %251, %250 [3] : vector<1xi8> into vector<4x1xi8>
%253 = arith.extsi %252 : vector<4x1xi8> to vector<4x1xi32>
%254 = vector.extract %253[0, 0] : vector<4x1xi32>
%255 = vector.insert %254, %cst_1 [0] : i32 into vector<4xi32>
%256 = vector.extract %253[1, 0] : vector<4x1xi32>
%257 = vector.insert %256, %255 [1] : i32 into vector<4xi32>
%258 = vector.extract %253[2, 0] : vector<4x1xi32>
%259 = vector.insert %258, %257 [2] : i32 into vector<4xi32>
%260 = vector.extract %253[3, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [3] : i32 into vector<4xi32>
%262 = vector.extract %arg8[2] : vector<4xi32>
%263 = arith.muli %33, %261 : vector<4xi32>
%264 = vector.reduction <add>, %263, %262 : vector<4xi32> into i32
%265 = vector.insert %264, %cst_0 [0] : i32 into vector<1xi32>
%266 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%267 = vector.insert %266, %cst [0] : vector<1xi8> into vector<4x1xi8>
%268 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%269 = vector.insert %268, %267 [1] : vector<1xi8> into vector<4x1xi8>
%270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%271 = vector.insert %270, %269 [2] : vector<1xi8> into vector<4x1xi8>
%272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%273 = vector.insert %272, %271 [3] : vector<1xi8> into vector<4x1xi8>
%274 = arith.extsi %273 : vector<4x1xi8> to vector<4x1xi32>
%275 = vector.extract %274[0, 0] : vector<4x1xi32>
%276 = vector.insert %275, %cst_1 [0] : i32 into vector<4xi32>
%277 = vector.extract %274[1, 0] : vector<4x1xi32>
%278 = vector.insert %277, %276 [1] : i32 into vector<4xi32>
%279 = vector.extract %274[2, 0] : vector<4x1xi32>
%280 = vector.insert %279, %278 [2] : i32 into vector<4xi32>
%281 = vector.extract %274[3, 0] : vector<4x1xi32>
%282 = vector.insert %281, %280 [3] : i32 into vector<4xi32>
%283 = vector.extract %arg8[3] : vector<4xi32>
%284 = arith.muli %33, %282 : vector<4xi32>
%285 = vector.reduction <add>, %284, %283 : vector<4xi32> into i32
%286 = vector.insert %285, %cst_0 [0] : i32 into vector<1xi32>
%287 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%288 = vector.insert %287, %cst [0] : vector<1xi8> into vector<4x1xi8>
%289 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%290 = vector.insert %289, %288 [1] : vector<1xi8> into vector<4x1xi8>
%291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%292 = vector.insert %291, %290 [2] : vector<1xi8> into vector<4x1xi8>
%293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%294 = vector.insert %293, %292 [3] : vector<1xi8> into vector<4x1xi8>
%295 = arith.extsi %294 : vector<4x1xi8> to vector<4x1xi32>
%296 = vector.extract %295[0, 0] : vector<4x1xi32>
%297 = vector.insert %296, %cst_1 [0] : i32 into vector<4xi32>
%298 = vector.extract %295[1, 0] : vector<4x1xi32>
%299 = vector.insert %298, %297 [1] : i32 into vector<4xi32>
%300 = vector.extract %295[2, 0] : vector<4x1xi32>
%301 = vector.insert %300, %299 [2] : i32 into vector<4xi32>
%302 = vector.extract %295[3, 0] : vector<4x1xi32>
%303 = vector.insert %302, %301 [3] : i32 into vector<4xi32>
%304 = vector.extract %arg7[0] : vector<4xi32>
%305 = arith.muli %34, %303 : vector<4xi32>
%306 = vector.reduction <add>, %305, %304 : vector<4xi32> into i32
%307 = vector.insert %306, %cst_0 [0] : i32 into vector<1xi32>
%308 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%309 = vector.insert %308, %cst [0] : vector<1xi8> into vector<4x1xi8>
%310 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%311 = vector.insert %310, %309 [1] : vector<1xi8> into vector<4x1xi8>
%312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%313 = vector.insert %312, %311 [2] : vector<1xi8> into vector<4x1xi8>
%314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%315 = vector.insert %314, %313 [3] : vector<1xi8> into vector<4x1xi8>
%316 = arith.extsi %315 : vector<4x1xi8> to vector<4x1xi32>
%317 = vector.extract %316[0, 0] : vector<4x1xi32>
%318 = vector.insert %317, %cst_1 [0] : i32 into vector<4xi32>
%319 = vector.extract %316[1, 0] : vector<4x1xi32>
%320 = vector.insert %319, %318 [1] : i32 into vector<4xi32>
%321 = vector.extract %316[2, 0] : vector<4x1xi32>
%322 = vector.insert %321, %320 [2] : i32 into vector<4xi32>
%323 = vector.extract %316[3, 0] : vector<4x1xi32>
%324 = vector.insert %323, %322 [3] : i32 into vector<4xi32>
%325 = vector.extract %arg7[1] : vector<4xi32>
%326 = arith.muli %34, %324 : vector<4xi32>
%327 = vector.reduction <add>, %326, %325 : vector<4xi32> into i32
%328 = vector.insert %327, %cst_0 [0] : i32 into vector<1xi32>
%329 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%330 = vector.insert %329, %cst [0] : vector<1xi8> into vector<4x1xi8>
%331 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%332 = vector.insert %331, %330 [1] : vector<1xi8> into vector<4x1xi8>
%333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%334 = vector.insert %333, %332 [2] : vector<1xi8> into vector<4x1xi8>
%335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%336 = vector.insert %335, %334 [3] : vector<1xi8> into vector<4x1xi8>
%337 = arith.extsi %336 : vector<4x1xi8> to vector<4x1xi32>
%338 = vector.extract %337[0, 0] : vector<4x1xi32>
%339 = vector.insert %338, %cst_1 [0] : i32 into vector<4xi32>
%340 = vector.extract %337[1, 0] : vector<4x1xi32>
%341 = vector.insert %340, %339 [1] : i32 into vector<4xi32>
%342 = vector.extract %337[2, 0] : vector<4x1xi32>
%343 = vector.insert %342, %341 [2] : i32 into vector<4xi32>
%344 = vector.extract %337[3, 0] : vector<4x1xi32>
%345 = vector.insert %344, %343 [3] : i32 into vector<4xi32>
%346 = vector.extract %arg7[2] : vector<4xi32>
%347 = arith.muli %34, %345 : vector<4xi32>
%348 = vector.reduction <add>, %347, %346 : vector<4xi32> into i32
%349 = vector.insert %348, %cst_0 [0] : i32 into vector<1xi32>
%350 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%351 = vector.insert %350, %cst [0] : vector<1xi8> into vector<4x1xi8>
%352 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%353 = vector.insert %352, %351 [1] : vector<1xi8> into vector<4x1xi8>
%354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%355 = vector.insert %354, %353 [2] : vector<1xi8> into vector<4x1xi8>
%356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%357 = vector.insert %356, %355 [3] : vector<1xi8> into vector<4x1xi8>
%358 = arith.extsi %357 : vector<4x1xi8> to vector<4x1xi32>
%359 = vector.extract %358[0, 0] : vector<4x1xi32>
%360 = vector.insert %359, %cst_1 [0] : i32 into vector<4xi32>
%361 = vector.extract %358[1, 0] : vector<4x1xi32>
%362 = vector.insert %361, %360 [1] : i32 into vector<4xi32>
%363 = vector.extract %358[2, 0] : vector<4x1xi32>
%364 = vector.insert %363, %362 [2] : i32 into vector<4xi32>
%365 = vector.extract %358[3, 0] : vector<4x1xi32>
%366 = vector.insert %365, %364 [3] : i32 into vector<4xi32>
%367 = vector.extract %arg7[3] : vector<4xi32>
%368 = arith.muli %34, %366 : vector<4xi32>
%369 = vector.reduction <add>, %368, %367 : vector<4xi32> into i32
%370 = vector.insert %369, %cst_0 [0] : i32 into vector<1xi32>
%371 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%372 = vector.insert_strided_slice %76, %371 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%373 = vector.insert_strided_slice %97, %372 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%374 = vector.insert_strided_slice %118, %373 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%375 = vector.insert_strided_slice %139, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%376 = vector.insert_strided_slice %160, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%377 = vector.insert_strided_slice %181, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%378 = vector.insert_strided_slice %202, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%379 = vector.insert_strided_slice %223, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%380 = vector.insert_strided_slice %244, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%381 = vector.insert_strided_slice %265, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%382 = vector.insert_strided_slice %286, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%383 = vector.insert_strided_slice %307, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%384 = vector.insert_strided_slice %328, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%385 = vector.insert_strided_slice %349, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%386 = vector.insert_strided_slice %370, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %386, %382, %378, %374 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
%16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
// -----// IR Dump After ForOpCanonicalization (iree-codegen-canonicalize-scf-for) //----- //
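// In the portion shown, scf.for canonicalization leaves the vectorized matmul
// body unchanged relative to the SPIRVVectorize dump above.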
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = arith.extsi %20 : vector<4xi8> to vector<4xi32>
%32 = arith.extsi %21 : vector<4xi8> to vector<4xi32>
%33 = arith.extsi %22 : vector<4xi8> to vector<4xi32>
%34 = arith.extsi %23 : vector<4xi8> to vector<4xi32>
%35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8>
%37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8>
%43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32>
%44 = vector.extract %43[0, 0] : vector<4x1xi32>
%45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32>
%46 = vector.extract %43[1, 0] : vector<4x1xi32>
%47 = vector.insert %46, %45 [1] : i32 into vector<4xi32>
%48 = vector.extract %43[2, 0] : vector<4x1xi32>
%49 = vector.insert %48, %47 [2] : i32 into vector<4xi32>
%50 = vector.extract %43[3, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [3] : i32 into vector<4xi32>
%52 = vector.extract %arg10[0] : vector<4xi32>
%53 = arith.muli %31, %51 : vector<4xi32>
%54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32
%55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32>
%56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8>
%58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %64[0, 0] : vector<4x1xi32>
%66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32>
%67 = vector.extract %64[1, 0] : vector<4x1xi32>
%68 = vector.insert %67, %66 [1] : i32 into vector<4xi32>
%69 = vector.extract %64[2, 0] : vector<4x1xi32>
%70 = vector.insert %69, %68 [2] : i32 into vector<4xi32>
%71 = vector.extract %64[3, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [3] : i32 into vector<4xi32>
%73 = vector.extract %arg10[1] : vector<4xi32>
%74 = arith.muli %31, %72 : vector<4xi32>
%75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32
%76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32>
%77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8>
%79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8>
%85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32>
%86 = vector.extract %85[0, 0] : vector<4x1xi32>
%87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32>
%88 = vector.extract %85[1, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [1] : i32 into vector<4xi32>
%90 = vector.extract %85[2, 0] : vector<4x1xi32>
%91 = vector.insert %90, %89 [2] : i32 into vector<4xi32>
%92 = vector.extract %85[3, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [3] : i32 into vector<4xi32>
%94 = vector.extract %arg10[2] : vector<4xi32>
%95 = arith.muli %31, %93 : vector<4xi32>
%96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32
%97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32>
%98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8>
%100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8>
%106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32>
%107 = vector.extract %106[0, 0] : vector<4x1xi32>
%108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32>
%109 = vector.extract %106[1, 0] : vector<4x1xi32>
%110 = vector.insert %109, %108 [1] : i32 into vector<4xi32>
%111 = vector.extract %106[2, 0] : vector<4x1xi32>
%112 = vector.insert %111, %110 [2] : i32 into vector<4xi32>
%113 = vector.extract %106[3, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [3] : i32 into vector<4xi32>
%115 = vector.extract %arg10[3] : vector<4xi32>
%116 = arith.muli %31, %114 : vector<4xi32>
%117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32
%118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32>
%119 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%120 = vector.insert %119, %cst [0] : vector<1xi8> into vector<4x1xi8>
%121 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%122 = vector.insert %121, %120 [1] : vector<1xi8> into vector<4x1xi8>
%123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%124 = vector.insert %123, %122 [2] : vector<1xi8> into vector<4x1xi8>
%125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%126 = vector.insert %125, %124 [3] : vector<1xi8> into vector<4x1xi8>
%127 = arith.extsi %126 : vector<4x1xi8> to vector<4x1xi32>
%128 = vector.extract %127[0, 0] : vector<4x1xi32>
%129 = vector.insert %128, %cst_1 [0] : i32 into vector<4xi32>
%130 = vector.extract %127[1, 0] : vector<4x1xi32>
%131 = vector.insert %130, %129 [1] : i32 into vector<4xi32>
%132 = vector.extract %127[2, 0] : vector<4x1xi32>
%133 = vector.insert %132, %131 [2] : i32 into vector<4xi32>
%134 = vector.extract %127[3, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [3] : i32 into vector<4xi32>
%136 = vector.extract %arg9[0] : vector<4xi32>
%137 = arith.muli %32, %135 : vector<4xi32>
%138 = vector.reduction <add>, %137, %136 : vector<4xi32> into i32
%139 = vector.insert %138, %cst_0 [0] : i32 into vector<1xi32>
%140 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%141 = vector.insert %140, %cst [0] : vector<1xi8> into vector<4x1xi8>
%142 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%143 = vector.insert %142, %141 [1] : vector<1xi8> into vector<4x1xi8>
%144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%145 = vector.insert %144, %143 [2] : vector<1xi8> into vector<4x1xi8>
%146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%147 = vector.insert %146, %145 [3] : vector<1xi8> into vector<4x1xi8>
%148 = arith.extsi %147 : vector<4x1xi8> to vector<4x1xi32>
%149 = vector.extract %148[0, 0] : vector<4x1xi32>
%150 = vector.insert %149, %cst_1 [0] : i32 into vector<4xi32>
%151 = vector.extract %148[1, 0] : vector<4x1xi32>
%152 = vector.insert %151, %150 [1] : i32 into vector<4xi32>
%153 = vector.extract %148[2, 0] : vector<4x1xi32>
%154 = vector.insert %153, %152 [2] : i32 into vector<4xi32>
%155 = vector.extract %148[3, 0] : vector<4x1xi32>
%156 = vector.insert %155, %154 [3] : i32 into vector<4xi32>
%157 = vector.extract %arg9[1] : vector<4xi32>
%158 = arith.muli %32, %156 : vector<4xi32>
%159 = vector.reduction <add>, %158, %157 : vector<4xi32> into i32
%160 = vector.insert %159, %cst_0 [0] : i32 into vector<1xi32>
%161 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%162 = vector.insert %161, %cst [0] : vector<1xi8> into vector<4x1xi8>
%163 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%164 = vector.insert %163, %162 [1] : vector<1xi8> into vector<4x1xi8>
%165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%166 = vector.insert %165, %164 [2] : vector<1xi8> into vector<4x1xi8>
%167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%168 = vector.insert %167, %166 [3] : vector<1xi8> into vector<4x1xi8>
%169 = arith.extsi %168 : vector<4x1xi8> to vector<4x1xi32>
%170 = vector.extract %169[0, 0] : vector<4x1xi32>
%171 = vector.insert %170, %cst_1 [0] : i32 into vector<4xi32>
%172 = vector.extract %169[1, 0] : vector<4x1xi32>
%173 = vector.insert %172, %171 [1] : i32 into vector<4xi32>
%174 = vector.extract %169[2, 0] : vector<4x1xi32>
%175 = vector.insert %174, %173 [2] : i32 into vector<4xi32>
%176 = vector.extract %169[3, 0] : vector<4x1xi32>
%177 = vector.insert %176, %175 [3] : i32 into vector<4xi32>
%178 = vector.extract %arg9[2] : vector<4xi32>
%179 = arith.muli %32, %177 : vector<4xi32>
%180 = vector.reduction <add>, %179, %178 : vector<4xi32> into i32
%181 = vector.insert %180, %cst_0 [0] : i32 into vector<1xi32>
%182 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%183 = vector.insert %182, %cst [0] : vector<1xi8> into vector<4x1xi8>
%184 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%185 = vector.insert %184, %183 [1] : vector<1xi8> into vector<4x1xi8>
%186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%187 = vector.insert %186, %185 [2] : vector<1xi8> into vector<4x1xi8>
%188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%189 = vector.insert %188, %187 [3] : vector<1xi8> into vector<4x1xi8>
%190 = arith.extsi %189 : vector<4x1xi8> to vector<4x1xi32>
%191 = vector.extract %190[0, 0] : vector<4x1xi32>
%192 = vector.insert %191, %cst_1 [0] : i32 into vector<4xi32>
%193 = vector.extract %190[1, 0] : vector<4x1xi32>
%194 = vector.insert %193, %192 [1] : i32 into vector<4xi32>
%195 = vector.extract %190[2, 0] : vector<4x1xi32>
%196 = vector.insert %195, %194 [2] : i32 into vector<4xi32>
%197 = vector.extract %190[3, 0] : vector<4x1xi32>
%198 = vector.insert %197, %196 [3] : i32 into vector<4xi32>
%199 = vector.extract %arg9[3] : vector<4xi32>
%200 = arith.muli %32, %198 : vector<4xi32>
%201 = vector.reduction <add>, %200, %199 : vector<4xi32> into i32
%202 = vector.insert %201, %cst_0 [0] : i32 into vector<1xi32>
%203 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%204 = vector.insert %203, %cst [0] : vector<1xi8> into vector<4x1xi8>
%205 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%206 = vector.insert %205, %204 [1] : vector<1xi8> into vector<4x1xi8>
%207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%208 = vector.insert %207, %206 [2] : vector<1xi8> into vector<4x1xi8>
%209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%210 = vector.insert %209, %208 [3] : vector<1xi8> into vector<4x1xi8>
%211 = arith.extsi %210 : vector<4x1xi8> to vector<4x1xi32>
%212 = vector.extract %211[0, 0] : vector<4x1xi32>
%213 = vector.insert %212, %cst_1 [0] : i32 into vector<4xi32>
%214 = vector.extract %211[1, 0] : vector<4x1xi32>
%215 = vector.insert %214, %213 [1] : i32 into vector<4xi32>
%216 = vector.extract %211[2, 0] : vector<4x1xi32>
%217 = vector.insert %216, %215 [2] : i32 into vector<4xi32>
%218 = vector.extract %211[3, 0] : vector<4x1xi32>
%219 = vector.insert %218, %217 [3] : i32 into vector<4xi32>
%220 = vector.extract %arg8[0] : vector<4xi32>
%221 = arith.muli %33, %219 : vector<4xi32>
%222 = vector.reduction <add>, %221, %220 : vector<4xi32> into i32
%223 = vector.insert %222, %cst_0 [0] : i32 into vector<1xi32>
%224 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%225 = vector.insert %224, %cst [0] : vector<1xi8> into vector<4x1xi8>
%226 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%227 = vector.insert %226, %225 [1] : vector<1xi8> into vector<4x1xi8>
%228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%229 = vector.insert %228, %227 [2] : vector<1xi8> into vector<4x1xi8>
%230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%231 = vector.insert %230, %229 [3] : vector<1xi8> into vector<4x1xi8>
%232 = arith.extsi %231 : vector<4x1xi8> to vector<4x1xi32>
%233 = vector.extract %232[0, 0] : vector<4x1xi32>
%234 = vector.insert %233, %cst_1 [0] : i32 into vector<4xi32>
%235 = vector.extract %232[1, 0] : vector<4x1xi32>
%236 = vector.insert %235, %234 [1] : i32 into vector<4xi32>
%237 = vector.extract %232[2, 0] : vector<4x1xi32>
%238 = vector.insert %237, %236 [2] : i32 into vector<4xi32>
%239 = vector.extract %232[3, 0] : vector<4x1xi32>
%240 = vector.insert %239, %238 [3] : i32 into vector<4xi32>
%241 = vector.extract %arg8[1] : vector<4xi32>
%242 = arith.muli %33, %240 : vector<4xi32>
%243 = vector.reduction <add>, %242, %241 : vector<4xi32> into i32
%244 = vector.insert %243, %cst_0 [0] : i32 into vector<1xi32>
%245 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%246 = vector.insert %245, %cst [0] : vector<1xi8> into vector<4x1xi8>
%247 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%248 = vector.insert %247, %246 [1] : vector<1xi8> into vector<4x1xi8>
%249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%250 = vector.insert %249, %248 [2] : vector<1xi8> into vector<4x1xi8>
%251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%252 = vector.insert %251, %250 [3] : vector<1xi8> into vector<4x1xi8>
%253 = arith.extsi %252 : vector<4x1xi8> to vector<4x1xi32>
%254 = vector.extract %253[0, 0] : vector<4x1xi32>
%255 = vector.insert %254, %cst_1 [0] : i32 into vector<4xi32>
%256 = vector.extract %253[1, 0] : vector<4x1xi32>
%257 = vector.insert %256, %255 [1] : i32 into vector<4xi32>
%258 = vector.extract %253[2, 0] : vector<4x1xi32>
%259 = vector.insert %258, %257 [2] : i32 into vector<4xi32>
%260 = vector.extract %253[3, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [3] : i32 into vector<4xi32>
%262 = vector.extract %arg8[2] : vector<4xi32>
%263 = arith.muli %33, %261 : vector<4xi32>
%264 = vector.reduction <add>, %263, %262 : vector<4xi32> into i32
%265 = vector.insert %264, %cst_0 [0] : i32 into vector<1xi32>
%266 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%267 = vector.insert %266, %cst [0] : vector<1xi8> into vector<4x1xi8>
%268 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%269 = vector.insert %268, %267 [1] : vector<1xi8> into vector<4x1xi8>
%270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%271 = vector.insert %270, %269 [2] : vector<1xi8> into vector<4x1xi8>
%272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%273 = vector.insert %272, %271 [3] : vector<1xi8> into vector<4x1xi8>
%274 = arith.extsi %273 : vector<4x1xi8> to vector<4x1xi32>
%275 = vector.extract %274[0, 0] : vector<4x1xi32>
%276 = vector.insert %275, %cst_1 [0] : i32 into vector<4xi32>
%277 = vector.extract %274[1, 0] : vector<4x1xi32>
%278 = vector.insert %277, %276 [1] : i32 into vector<4xi32>
%279 = vector.extract %274[2, 0] : vector<4x1xi32>
%280 = vector.insert %279, %278 [2] : i32 into vector<4xi32>
%281 = vector.extract %274[3, 0] : vector<4x1xi32>
%282 = vector.insert %281, %280 [3] : i32 into vector<4xi32>
%283 = vector.extract %arg8[3] : vector<4xi32>
%284 = arith.muli %33, %282 : vector<4xi32>
%285 = vector.reduction <add>, %284, %283 : vector<4xi32> into i32
%286 = vector.insert %285, %cst_0 [0] : i32 into vector<1xi32>
%287 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%288 = vector.insert %287, %cst [0] : vector<1xi8> into vector<4x1xi8>
%289 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%290 = vector.insert %289, %288 [1] : vector<1xi8> into vector<4x1xi8>
%291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%292 = vector.insert %291, %290 [2] : vector<1xi8> into vector<4x1xi8>
%293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%294 = vector.insert %293, %292 [3] : vector<1xi8> into vector<4x1xi8>
%295 = arith.extsi %294 : vector<4x1xi8> to vector<4x1xi32>
%296 = vector.extract %295[0, 0] : vector<4x1xi32>
%297 = vector.insert %296, %cst_1 [0] : i32 into vector<4xi32>
%298 = vector.extract %295[1, 0] : vector<4x1xi32>
%299 = vector.insert %298, %297 [1] : i32 into vector<4xi32>
%300 = vector.extract %295[2, 0] : vector<4x1xi32>
%301 = vector.insert %300, %299 [2] : i32 into vector<4xi32>
%302 = vector.extract %295[3, 0] : vector<4x1xi32>
%303 = vector.insert %302, %301 [3] : i32 into vector<4xi32>
%304 = vector.extract %arg7[0] : vector<4xi32>
%305 = arith.muli %34, %303 : vector<4xi32>
%306 = vector.reduction <add>, %305, %304 : vector<4xi32> into i32
%307 = vector.insert %306, %cst_0 [0] : i32 into vector<1xi32>
%308 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%309 = vector.insert %308, %cst [0] : vector<1xi8> into vector<4x1xi8>
%310 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%311 = vector.insert %310, %309 [1] : vector<1xi8> into vector<4x1xi8>
%312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%313 = vector.insert %312, %311 [2] : vector<1xi8> into vector<4x1xi8>
%314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%315 = vector.insert %314, %313 [3] : vector<1xi8> into vector<4x1xi8>
%316 = arith.extsi %315 : vector<4x1xi8> to vector<4x1xi32>
%317 = vector.extract %316[0, 0] : vector<4x1xi32>
%318 = vector.insert %317, %cst_1 [0] : i32 into vector<4xi32>
%319 = vector.extract %316[1, 0] : vector<4x1xi32>
%320 = vector.insert %319, %318 [1] : i32 into vector<4xi32>
%321 = vector.extract %316[2, 0] : vector<4x1xi32>
%322 = vector.insert %321, %320 [2] : i32 into vector<4xi32>
%323 = vector.extract %316[3, 0] : vector<4x1xi32>
%324 = vector.insert %323, %322 [3] : i32 into vector<4xi32>
%325 = vector.extract %arg7[1] : vector<4xi32>
%326 = arith.muli %34, %324 : vector<4xi32>
%327 = vector.reduction <add>, %326, %325 : vector<4xi32> into i32
%328 = vector.insert %327, %cst_0 [0] : i32 into vector<1xi32>
%329 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%330 = vector.insert %329, %cst [0] : vector<1xi8> into vector<4x1xi8>
%331 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%332 = vector.insert %331, %330 [1] : vector<1xi8> into vector<4x1xi8>
%333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%334 = vector.insert %333, %332 [2] : vector<1xi8> into vector<4x1xi8>
%335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%336 = vector.insert %335, %334 [3] : vector<1xi8> into vector<4x1xi8>
%337 = arith.extsi %336 : vector<4x1xi8> to vector<4x1xi32>
%338 = vector.extract %337[0, 0] : vector<4x1xi32>
%339 = vector.insert %338, %cst_1 [0] : i32 into vector<4xi32>
%340 = vector.extract %337[1, 0] : vector<4x1xi32>
%341 = vector.insert %340, %339 [1] : i32 into vector<4xi32>
%342 = vector.extract %337[2, 0] : vector<4x1xi32>
%343 = vector.insert %342, %341 [2] : i32 into vector<4xi32>
%344 = vector.extract %337[3, 0] : vector<4x1xi32>
%345 = vector.insert %344, %343 [3] : i32 into vector<4xi32>
%346 = vector.extract %arg7[2] : vector<4xi32>
%347 = arith.muli %34, %345 : vector<4xi32>
%348 = vector.reduction <add>, %347, %346 : vector<4xi32> into i32
%349 = vector.insert %348, %cst_0 [0] : i32 into vector<1xi32>
%350 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%351 = vector.insert %350, %cst [0] : vector<1xi8> into vector<4x1xi8>
%352 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%353 = vector.insert %352, %351 [1] : vector<1xi8> into vector<4x1xi8>
%354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%355 = vector.insert %354, %353 [2] : vector<1xi8> into vector<4x1xi8>
%356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%357 = vector.insert %356, %355 [3] : vector<1xi8> into vector<4x1xi8>
%358 = arith.extsi %357 : vector<4x1xi8> to vector<4x1xi32>
%359 = vector.extract %358[0, 0] : vector<4x1xi32>
%360 = vector.insert %359, %cst_1 [0] : i32 into vector<4xi32>
%361 = vector.extract %358[1, 0] : vector<4x1xi32>
%362 = vector.insert %361, %360 [1] : i32 into vector<4xi32>
%363 = vector.extract %358[2, 0] : vector<4x1xi32>
%364 = vector.insert %363, %362 [2] : i32 into vector<4xi32>
%365 = vector.extract %358[3, 0] : vector<4x1xi32>
%366 = vector.insert %365, %364 [3] : i32 into vector<4xi32>
%367 = vector.extract %arg7[3] : vector<4xi32>
%368 = arith.muli %34, %366 : vector<4xi32>
%369 = vector.reduction <add>, %368, %367 : vector<4xi32> into i32
%370 = vector.insert %369, %cst_0 [0] : i32 into vector<1xi32>
%371 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%372 = vector.insert_strided_slice %76, %371 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%373 = vector.insert_strided_slice %97, %372 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%374 = vector.insert_strided_slice %118, %373 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%375 = vector.insert_strided_slice %139, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%376 = vector.insert_strided_slice %160, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%377 = vector.insert_strided_slice %181, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%378 = vector.insert_strided_slice %202, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%379 = vector.insert_strided_slice %223, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%380 = vector.insert_strided_slice %244, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%381 = vector.insert_strided_slice %265, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%382 = vector.insert_strided_slice %286, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%383 = vector.insert_strided_slice %307, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%384 = vector.insert_strided_slice %328, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%385 = vector.insert_strided_slice %349, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%386 = vector.insert_strided_slice %370, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %386, %382, %378, %374 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
%16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
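    // Workgroup tiling (as visible from the loads below): the outer loop distributes
    // 8-row LHS strips (8x1024) across workgroups in y, the inner loop distributes
    // 32-column RHS strips (1024x32) across workgroups in x, so each iteration of the
    // inner loop produces one 8x32 output tile.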
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
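          // Reduction over K: each iteration of the loop below reads a 4x4 i8 tile from
          // each operand, sign-extends to i32, and updates four vector<4xi32> accumulators
          // (one per output row) via sixteen arith.muli + vector.reduction <add> dot products.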
%15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = arith.extsi %20 : vector<4xi8> to vector<4xi32>
%32 = arith.extsi %21 : vector<4xi8> to vector<4xi32>
%33 = arith.extsi %22 : vector<4xi8> to vector<4xi32>
%34 = arith.extsi %23 : vector<4xi8> to vector<4xi32>
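            // %31-%34 are the sign-extended LHS rows. In the chains that follow, each RHS
            // column is gathered one i8 element at a time into a vector<4x1xi8>, extended to
            // i32, and repacked into a vector<4xi32> before the multiply-reduce.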
%35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8>
%37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8>
%43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32>
%44 = vector.extract %43[0, 0] : vector<4x1xi32>
%45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32>
%46 = vector.extract %43[1, 0] : vector<4x1xi32>
%47 = vector.insert %46, %45 [1] : i32 into vector<4xi32>
%48 = vector.extract %43[2, 0] : vector<4x1xi32>
%49 = vector.insert %48, %47 [2] : i32 into vector<4xi32>
%50 = vector.extract %43[3, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [3] : i32 into vector<4xi32>
%52 = vector.extract %arg10[0] : vector<4xi32>
%53 = arith.muli %31, %51 : vector<4xi32>
%54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32
%55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32>
%56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8>
%58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %64[0, 0] : vector<4x1xi32>
%66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32>
%67 = vector.extract %64[1, 0] : vector<4x1xi32>
%68 = vector.insert %67, %66 [1] : i32 into vector<4xi32>
%69 = vector.extract %64[2, 0] : vector<4x1xi32>
%70 = vector.insert %69, %68 [2] : i32 into vector<4xi32>
%71 = vector.extract %64[3, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [3] : i32 into vector<4xi32>
%73 = vector.extract %arg10[1] : vector<4xi32>
%74 = arith.muli %31, %72 : vector<4xi32>
%75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32
%76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32>
%77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8>
%79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8>
%85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32>
%86 = vector.extract %85[0, 0] : vector<4x1xi32>
%87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32>
%88 = vector.extract %85[1, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [1] : i32 into vector<4xi32>
%90 = vector.extract %85[2, 0] : vector<4x1xi32>
%91 = vector.insert %90, %89 [2] : i32 into vector<4xi32>
%92 = vector.extract %85[3, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [3] : i32 into vector<4xi32>
%94 = vector.extract %arg10[2] : vector<4xi32>
%95 = arith.muli %31, %93 : vector<4xi32>
%96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32
%97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32>
%98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8>
%100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8>
%106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32>
%107 = vector.extract %106[0, 0] : vector<4x1xi32>
%108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32>
%109 = vector.extract %106[1, 0] : vector<4x1xi32>
%110 = vector.insert %109, %108 [1] : i32 into vector<4xi32>
%111 = vector.extract %106[2, 0] : vector<4x1xi32>
%112 = vector.insert %111, %110 [2] : i32 into vector<4xi32>
%113 = vector.extract %106[3, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [3] : i32 into vector<4xi32>
%115 = vector.extract %arg10[3] : vector<4xi32>
%116 = arith.muli %31, %114 : vector<4xi32>
%117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32
%118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32>
%119 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%120 = vector.insert %119, %cst [0] : vector<1xi8> into vector<4x1xi8>
%121 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%122 = vector.insert %121, %120 [1] : vector<1xi8> into vector<4x1xi8>
%123 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%124 = vector.insert %123, %122 [2] : vector<1xi8> into vector<4x1xi8>
%125 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%126 = vector.insert %125, %124 [3] : vector<1xi8> into vector<4x1xi8>
%127 = arith.extsi %126 : vector<4x1xi8> to vector<4x1xi32>
%128 = vector.extract %127[0, 0] : vector<4x1xi32>
%129 = vector.insert %128, %cst_1 [0] : i32 into vector<4xi32>
%130 = vector.extract %127[1, 0] : vector<4x1xi32>
%131 = vector.insert %130, %129 [1] : i32 into vector<4xi32>
%132 = vector.extract %127[2, 0] : vector<4x1xi32>
%133 = vector.insert %132, %131 [2] : i32 into vector<4xi32>
%134 = vector.extract %127[3, 0] : vector<4x1xi32>
%135 = vector.insert %134, %133 [3] : i32 into vector<4xi32>
%136 = vector.extract %arg9[0] : vector<4xi32>
%137 = arith.muli %32, %135 : vector<4xi32>
%138 = vector.reduction <add>, %137, %136 : vector<4xi32> into i32
%139 = vector.insert %138, %cst_0 [0] : i32 into vector<1xi32>
%140 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%141 = vector.insert %140, %cst [0] : vector<1xi8> into vector<4x1xi8>
%142 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%143 = vector.insert %142, %141 [1] : vector<1xi8> into vector<4x1xi8>
%144 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%145 = vector.insert %144, %143 [2] : vector<1xi8> into vector<4x1xi8>
%146 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%147 = vector.insert %146, %145 [3] : vector<1xi8> into vector<4x1xi8>
%148 = arith.extsi %147 : vector<4x1xi8> to vector<4x1xi32>
%149 = vector.extract %148[0, 0] : vector<4x1xi32>
%150 = vector.insert %149, %cst_1 [0] : i32 into vector<4xi32>
%151 = vector.extract %148[1, 0] : vector<4x1xi32>
%152 = vector.insert %151, %150 [1] : i32 into vector<4xi32>
%153 = vector.extract %148[2, 0] : vector<4x1xi32>
%154 = vector.insert %153, %152 [2] : i32 into vector<4xi32>
%155 = vector.extract %148[3, 0] : vector<4x1xi32>
%156 = vector.insert %155, %154 [3] : i32 into vector<4xi32>
%157 = vector.extract %arg9[1] : vector<4xi32>
%158 = arith.muli %32, %156 : vector<4xi32>
%159 = vector.reduction <add>, %158, %157 : vector<4xi32> into i32
%160 = vector.insert %159, %cst_0 [0] : i32 into vector<1xi32>
%161 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%162 = vector.insert %161, %cst [0] : vector<1xi8> into vector<4x1xi8>
%163 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%164 = vector.insert %163, %162 [1] : vector<1xi8> into vector<4x1xi8>
%165 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%166 = vector.insert %165, %164 [2] : vector<1xi8> into vector<4x1xi8>
%167 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%168 = vector.insert %167, %166 [3] : vector<1xi8> into vector<4x1xi8>
%169 = arith.extsi %168 : vector<4x1xi8> to vector<4x1xi32>
%170 = vector.extract %169[0, 0] : vector<4x1xi32>
%171 = vector.insert %170, %cst_1 [0] : i32 into vector<4xi32>
%172 = vector.extract %169[1, 0] : vector<4x1xi32>
%173 = vector.insert %172, %171 [1] : i32 into vector<4xi32>
%174 = vector.extract %169[2, 0] : vector<4x1xi32>
%175 = vector.insert %174, %173 [2] : i32 into vector<4xi32>
%176 = vector.extract %169[3, 0] : vector<4x1xi32>
%177 = vector.insert %176, %175 [3] : i32 into vector<4xi32>
%178 = vector.extract %arg9[2] : vector<4xi32>
%179 = arith.muli %32, %177 : vector<4xi32>
%180 = vector.reduction <add>, %179, %178 : vector<4xi32> into i32
%181 = vector.insert %180, %cst_0 [0] : i32 into vector<1xi32>
%182 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%183 = vector.insert %182, %cst [0] : vector<1xi8> into vector<4x1xi8>
%184 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%185 = vector.insert %184, %183 [1] : vector<1xi8> into vector<4x1xi8>
%186 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%187 = vector.insert %186, %185 [2] : vector<1xi8> into vector<4x1xi8>
%188 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%189 = vector.insert %188, %187 [3] : vector<1xi8> into vector<4x1xi8>
%190 = arith.extsi %189 : vector<4x1xi8> to vector<4x1xi32>
%191 = vector.extract %190[0, 0] : vector<4x1xi32>
%192 = vector.insert %191, %cst_1 [0] : i32 into vector<4xi32>
%193 = vector.extract %190[1, 0] : vector<4x1xi32>
%194 = vector.insert %193, %192 [1] : i32 into vector<4xi32>
%195 = vector.extract %190[2, 0] : vector<4x1xi32>
%196 = vector.insert %195, %194 [2] : i32 into vector<4xi32>
%197 = vector.extract %190[3, 0] : vector<4x1xi32>
%198 = vector.insert %197, %196 [3] : i32 into vector<4xi32>
%199 = vector.extract %arg9[3] : vector<4xi32>
%200 = arith.muli %32, %198 : vector<4xi32>
%201 = vector.reduction <add>, %200, %199 : vector<4xi32> into i32
%202 = vector.insert %201, %cst_0 [0] : i32 into vector<1xi32>
%203 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%204 = vector.insert %203, %cst [0] : vector<1xi8> into vector<4x1xi8>
%205 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%206 = vector.insert %205, %204 [1] : vector<1xi8> into vector<4x1xi8>
%207 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%208 = vector.insert %207, %206 [2] : vector<1xi8> into vector<4x1xi8>
%209 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%210 = vector.insert %209, %208 [3] : vector<1xi8> into vector<4x1xi8>
%211 = arith.extsi %210 : vector<4x1xi8> to vector<4x1xi32>
%212 = vector.extract %211[0, 0] : vector<4x1xi32>
%213 = vector.insert %212, %cst_1 [0] : i32 into vector<4xi32>
%214 = vector.extract %211[1, 0] : vector<4x1xi32>
%215 = vector.insert %214, %213 [1] : i32 into vector<4xi32>
%216 = vector.extract %211[2, 0] : vector<4x1xi32>
%217 = vector.insert %216, %215 [2] : i32 into vector<4xi32>
%218 = vector.extract %211[3, 0] : vector<4x1xi32>
%219 = vector.insert %218, %217 [3] : i32 into vector<4xi32>
%220 = vector.extract %arg8[0] : vector<4xi32>
%221 = arith.muli %33, %219 : vector<4xi32>
%222 = vector.reduction <add>, %221, %220 : vector<4xi32> into i32
%223 = vector.insert %222, %cst_0 [0] : i32 into vector<1xi32>
%224 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%225 = vector.insert %224, %cst [0] : vector<1xi8> into vector<4x1xi8>
%226 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%227 = vector.insert %226, %225 [1] : vector<1xi8> into vector<4x1xi8>
%228 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%229 = vector.insert %228, %227 [2] : vector<1xi8> into vector<4x1xi8>
%230 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%231 = vector.insert %230, %229 [3] : vector<1xi8> into vector<4x1xi8>
%232 = arith.extsi %231 : vector<4x1xi8> to vector<4x1xi32>
%233 = vector.extract %232[0, 0] : vector<4x1xi32>
%234 = vector.insert %233, %cst_1 [0] : i32 into vector<4xi32>
%235 = vector.extract %232[1, 0] : vector<4x1xi32>
%236 = vector.insert %235, %234 [1] : i32 into vector<4xi32>
%237 = vector.extract %232[2, 0] : vector<4x1xi32>
%238 = vector.insert %237, %236 [2] : i32 into vector<4xi32>
%239 = vector.extract %232[3, 0] : vector<4x1xi32>
%240 = vector.insert %239, %238 [3] : i32 into vector<4xi32>
%241 = vector.extract %arg8[1] : vector<4xi32>
%242 = arith.muli %33, %240 : vector<4xi32>
%243 = vector.reduction <add>, %242, %241 : vector<4xi32> into i32
%244 = vector.insert %243, %cst_0 [0] : i32 into vector<1xi32>
%245 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%246 = vector.insert %245, %cst [0] : vector<1xi8> into vector<4x1xi8>
%247 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%248 = vector.insert %247, %246 [1] : vector<1xi8> into vector<4x1xi8>
%249 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%250 = vector.insert %249, %248 [2] : vector<1xi8> into vector<4x1xi8>
%251 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%252 = vector.insert %251, %250 [3] : vector<1xi8> into vector<4x1xi8>
%253 = arith.extsi %252 : vector<4x1xi8> to vector<4x1xi32>
%254 = vector.extract %253[0, 0] : vector<4x1xi32>
%255 = vector.insert %254, %cst_1 [0] : i32 into vector<4xi32>
%256 = vector.extract %253[1, 0] : vector<4x1xi32>
%257 = vector.insert %256, %255 [1] : i32 into vector<4xi32>
%258 = vector.extract %253[2, 0] : vector<4x1xi32>
%259 = vector.insert %258, %257 [2] : i32 into vector<4xi32>
%260 = vector.extract %253[3, 0] : vector<4x1xi32>
%261 = vector.insert %260, %259 [3] : i32 into vector<4xi32>
%262 = vector.extract %arg8[2] : vector<4xi32>
%263 = arith.muli %33, %261 : vector<4xi32>
%264 = vector.reduction <add>, %263, %262 : vector<4xi32> into i32
%265 = vector.insert %264, %cst_0 [0] : i32 into vector<1xi32>
%266 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%267 = vector.insert %266, %cst [0] : vector<1xi8> into vector<4x1xi8>
%268 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%269 = vector.insert %268, %267 [1] : vector<1xi8> into vector<4x1xi8>
%270 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%271 = vector.insert %270, %269 [2] : vector<1xi8> into vector<4x1xi8>
%272 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%273 = vector.insert %272, %271 [3] : vector<1xi8> into vector<4x1xi8>
%274 = arith.extsi %273 : vector<4x1xi8> to vector<4x1xi32>
%275 = vector.extract %274[0, 0] : vector<4x1xi32>
%276 = vector.insert %275, %cst_1 [0] : i32 into vector<4xi32>
%277 = vector.extract %274[1, 0] : vector<4x1xi32>
%278 = vector.insert %277, %276 [1] : i32 into vector<4xi32>
%279 = vector.extract %274[2, 0] : vector<4x1xi32>
%280 = vector.insert %279, %278 [2] : i32 into vector<4xi32>
%281 = vector.extract %274[3, 0] : vector<4x1xi32>
%282 = vector.insert %281, %280 [3] : i32 into vector<4xi32>
%283 = vector.extract %arg8[3] : vector<4xi32>
%284 = arith.muli %33, %282 : vector<4xi32>
%285 = vector.reduction <add>, %284, %283 : vector<4xi32> into i32
%286 = vector.insert %285, %cst_0 [0] : i32 into vector<1xi32>
%287 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%288 = vector.insert %287, %cst [0] : vector<1xi8> into vector<4x1xi8>
%289 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%290 = vector.insert %289, %288 [1] : vector<1xi8> into vector<4x1xi8>
%291 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%292 = vector.insert %291, %290 [2] : vector<1xi8> into vector<4x1xi8>
%293 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%294 = vector.insert %293, %292 [3] : vector<1xi8> into vector<4x1xi8>
%295 = arith.extsi %294 : vector<4x1xi8> to vector<4x1xi32>
%296 = vector.extract %295[0, 0] : vector<4x1xi32>
%297 = vector.insert %296, %cst_1 [0] : i32 into vector<4xi32>
%298 = vector.extract %295[1, 0] : vector<4x1xi32>
%299 = vector.insert %298, %297 [1] : i32 into vector<4xi32>
%300 = vector.extract %295[2, 0] : vector<4x1xi32>
%301 = vector.insert %300, %299 [2] : i32 into vector<4xi32>
%302 = vector.extract %295[3, 0] : vector<4x1xi32>
%303 = vector.insert %302, %301 [3] : i32 into vector<4xi32>
%304 = vector.extract %arg7[0] : vector<4xi32>
%305 = arith.muli %34, %303 : vector<4xi32>
%306 = vector.reduction <add>, %305, %304 : vector<4xi32> into i32
%307 = vector.insert %306, %cst_0 [0] : i32 into vector<1xi32>
%308 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%309 = vector.insert %308, %cst [0] : vector<1xi8> into vector<4x1xi8>
%310 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%311 = vector.insert %310, %309 [1] : vector<1xi8> into vector<4x1xi8>
%312 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%313 = vector.insert %312, %311 [2] : vector<1xi8> into vector<4x1xi8>
%314 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%315 = vector.insert %314, %313 [3] : vector<1xi8> into vector<4x1xi8>
%316 = arith.extsi %315 : vector<4x1xi8> to vector<4x1xi32>
%317 = vector.extract %316[0, 0] : vector<4x1xi32>
%318 = vector.insert %317, %cst_1 [0] : i32 into vector<4xi32>
%319 = vector.extract %316[1, 0] : vector<4x1xi32>
%320 = vector.insert %319, %318 [1] : i32 into vector<4xi32>
%321 = vector.extract %316[2, 0] : vector<4x1xi32>
%322 = vector.insert %321, %320 [2] : i32 into vector<4xi32>
%323 = vector.extract %316[3, 0] : vector<4x1xi32>
%324 = vector.insert %323, %322 [3] : i32 into vector<4xi32>
%325 = vector.extract %arg7[1] : vector<4xi32>
%326 = arith.muli %34, %324 : vector<4xi32>
%327 = vector.reduction <add>, %326, %325 : vector<4xi32> into i32
%328 = vector.insert %327, %cst_0 [0] : i32 into vector<1xi32>
%329 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%330 = vector.insert %329, %cst [0] : vector<1xi8> into vector<4x1xi8>
%331 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%332 = vector.insert %331, %330 [1] : vector<1xi8> into vector<4x1xi8>
%333 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%334 = vector.insert %333, %332 [2] : vector<1xi8> into vector<4x1xi8>
%335 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%336 = vector.insert %335, %334 [3] : vector<1xi8> into vector<4x1xi8>
%337 = arith.extsi %336 : vector<4x1xi8> to vector<4x1xi32>
%338 = vector.extract %337[0, 0] : vector<4x1xi32>
%339 = vector.insert %338, %cst_1 [0] : i32 into vector<4xi32>
%340 = vector.extract %337[1, 0] : vector<4x1xi32>
%341 = vector.insert %340, %339 [1] : i32 into vector<4xi32>
%342 = vector.extract %337[2, 0] : vector<4x1xi32>
%343 = vector.insert %342, %341 [2] : i32 into vector<4xi32>
%344 = vector.extract %337[3, 0] : vector<4x1xi32>
%345 = vector.insert %344, %343 [3] : i32 into vector<4xi32>
%346 = vector.extract %arg7[2] : vector<4xi32>
%347 = arith.muli %34, %345 : vector<4xi32>
%348 = vector.reduction <add>, %347, %346 : vector<4xi32> into i32
%349 = vector.insert %348, %cst_0 [0] : i32 into vector<1xi32>
%350 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%351 = vector.insert %350, %cst [0] : vector<1xi8> into vector<4x1xi8>
%352 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%353 = vector.insert %352, %351 [1] : vector<1xi8> into vector<4x1xi8>
%354 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%355 = vector.insert %354, %353 [2] : vector<1xi8> into vector<4x1xi8>
%356 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%357 = vector.insert %356, %355 [3] : vector<1xi8> into vector<4x1xi8>
%358 = arith.extsi %357 : vector<4x1xi8> to vector<4x1xi32>
%359 = vector.extract %358[0, 0] : vector<4x1xi32>
%360 = vector.insert %359, %cst_1 [0] : i32 into vector<4xi32>
%361 = vector.extract %358[1, 0] : vector<4x1xi32>
%362 = vector.insert %361, %360 [1] : i32 into vector<4xi32>
%363 = vector.extract %358[2, 0] : vector<4x1xi32>
%364 = vector.insert %363, %362 [2] : i32 into vector<4xi32>
%365 = vector.extract %358[3, 0] : vector<4x1xi32>
%366 = vector.insert %365, %364 [3] : i32 into vector<4xi32>
%367 = vector.extract %arg7[3] : vector<4xi32>
%368 = arith.muli %34, %366 : vector<4xi32>
%369 = vector.reduction <add>, %368, %367 : vector<4xi32> into i32
%370 = vector.insert %369, %cst_0 [0] : i32 into vector<1xi32>
%371 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%372 = vector.insert_strided_slice %76, %371 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%373 = vector.insert_strided_slice %97, %372 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%374 = vector.insert_strided_slice %118, %373 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%375 = vector.insert_strided_slice %139, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%376 = vector.insert_strided_slice %160, %375 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%377 = vector.insert_strided_slice %181, %376 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%378 = vector.insert_strided_slice %202, %377 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%379 = vector.insert_strided_slice %223, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%380 = vector.insert_strided_slice %244, %379 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%381 = vector.insert_strided_slice %265, %380 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%382 = vector.insert_strided_slice %286, %381 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%383 = vector.insert_strided_slice %307, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%384 = vector.insert_strided_slice %328, %383 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%385 = vector.insert_strided_slice %349, %384 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%386 = vector.insert_strided_slice %370, %385 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %386, %382, %378, %374 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
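          // Epilogue: write the four row accumulators back into the 4x4 tile and re-insert
          // it into the 8x32 workgroup tile carried by the enclosing loops.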
%16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = arith.extsi %20 : vector<4xi8> to vector<4xi32>
%32 = arith.extsi %21 : vector<4xi8> to vector<4xi32>
%33 = arith.extsi %22 : vector<4xi8> to vector<4xi32>
%34 = arith.extsi %23 : vector<4xi8> to vector<4xi32>
%35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8>
%37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8>
%43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32>
%44 = vector.extract %43[0, 0] : vector<4x1xi32>
%45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32>
%46 = vector.extract %43[1, 0] : vector<4x1xi32>
%47 = vector.insert %46, %45 [1] : i32 into vector<4xi32>
%48 = vector.extract %43[2, 0] : vector<4x1xi32>
%49 = vector.insert %48, %47 [2] : i32 into vector<4xi32>
%50 = vector.extract %43[3, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [3] : i32 into vector<4xi32>
%52 = vector.extract %arg10[0] : vector<4xi32>
%53 = arith.muli %31, %51 : vector<4xi32>
%54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32
%55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32>
%56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8>
%58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %64[0, 0] : vector<4x1xi32>
%66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32>
%67 = vector.extract %64[1, 0] : vector<4x1xi32>
%68 = vector.insert %67, %66 [1] : i32 into vector<4xi32>
%69 = vector.extract %64[2, 0] : vector<4x1xi32>
%70 = vector.insert %69, %68 [2] : i32 into vector<4xi32>
%71 = vector.extract %64[3, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [3] : i32 into vector<4xi32>
%73 = vector.extract %arg10[1] : vector<4xi32>
%74 = arith.muli %31, %72 : vector<4xi32>
%75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32
%76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32>
%77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8>
%79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8>
%85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32>
%86 = vector.extract %85[0, 0] : vector<4x1xi32>
%87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32>
%88 = vector.extract %85[1, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [1] : i32 into vector<4xi32>
%90 = vector.extract %85[2, 0] : vector<4x1xi32>
%91 = vector.insert %90, %89 [2] : i32 into vector<4xi32>
%92 = vector.extract %85[3, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [3] : i32 into vector<4xi32>
%94 = vector.extract %arg10[2] : vector<4xi32>
%95 = arith.muli %31, %93 : vector<4xi32>
%96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32
%97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32>
%98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8>
%100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8>
%106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32>
%107 = vector.extract %106[0, 0] : vector<4x1xi32>
%108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32>
%109 = vector.extract %106[1, 0] : vector<4x1xi32>
%110 = vector.insert %109, %108 [1] : i32 into vector<4xi32>
%111 = vector.extract %106[2, 0] : vector<4x1xi32>
%112 = vector.insert %111, %110 [2] : i32 into vector<4xi32>
%113 = vector.extract %106[3, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [3] : i32 into vector<4xi32>
%115 = vector.extract %arg10[3] : vector<4xi32>
%116 = arith.muli %31, %114 : vector<4xi32>
%117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32
%118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32>
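            // After CSE the per-column gather/extend chains are no longer repeated: the
            // sign-extended RHS columns %51, %72, %93, and %114 are reused directly by the
            // remaining twelve multiply-reduce chains below.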
%119 = vector.extract %arg9[0] : vector<4xi32>
%120 = arith.muli %32, %51 : vector<4xi32>
%121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32
%122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32>
%123 = vector.extract %arg9[1] : vector<4xi32>
%124 = arith.muli %32, %72 : vector<4xi32>
%125 = vector.reduction <add>, %124, %123 : vector<4xi32> into i32
%126 = vector.insert %125, %cst_0 [0] : i32 into vector<1xi32>
%127 = vector.extract %arg9[2] : vector<4xi32>
%128 = arith.muli %32, %93 : vector<4xi32>
%129 = vector.reduction <add>, %128, %127 : vector<4xi32> into i32
%130 = vector.insert %129, %cst_0 [0] : i32 into vector<1xi32>
%131 = vector.extract %arg9[3] : vector<4xi32>
%132 = arith.muli %32, %114 : vector<4xi32>
%133 = vector.reduction <add>, %132, %131 : vector<4xi32> into i32
%134 = vector.insert %133, %cst_0 [0] : i32 into vector<1xi32>
%135 = vector.extract %arg8[0] : vector<4xi32>
%136 = arith.muli %33, %51 : vector<4xi32>
%137 = vector.reduction <add>, %136, %135 : vector<4xi32> into i32
%138 = vector.insert %137, %cst_0 [0] : i32 into vector<1xi32>
%139 = vector.extract %arg8[1] : vector<4xi32>
%140 = arith.muli %33, %72 : vector<4xi32>
%141 = vector.reduction <add>, %140, %139 : vector<4xi32> into i32
%142 = vector.insert %141, %cst_0 [0] : i32 into vector<1xi32>
%143 = vector.extract %arg8[2] : vector<4xi32>
%144 = arith.muli %33, %93 : vector<4xi32>
%145 = vector.reduction <add>, %144, %143 : vector<4xi32> into i32
%146 = vector.insert %145, %cst_0 [0] : i32 into vector<1xi32>
%147 = vector.extract %arg8[3] : vector<4xi32>
%148 = arith.muli %33, %114 : vector<4xi32>
%149 = vector.reduction <add>, %148, %147 : vector<4xi32> into i32
%150 = vector.insert %149, %cst_0 [0] : i32 into vector<1xi32>
%151 = vector.extract %arg7[0] : vector<4xi32>
%152 = arith.muli %34, %51 : vector<4xi32>
%153 = vector.reduction <add>, %152, %151 : vector<4xi32> into i32
%154 = vector.insert %153, %cst_0 [0] : i32 into vector<1xi32>
%155 = vector.extract %arg7[1] : vector<4xi32>
%156 = arith.muli %34, %72 : vector<4xi32>
%157 = vector.reduction <add>, %156, %155 : vector<4xi32> into i32
%158 = vector.insert %157, %cst_0 [0] : i32 into vector<1xi32>
%159 = vector.extract %arg7[2] : vector<4xi32>
%160 = arith.muli %34, %93 : vector<4xi32>
%161 = vector.reduction <add>, %160, %159 : vector<4xi32> into i32
%162 = vector.insert %161, %cst_0 [0] : i32 into vector<1xi32>
%163 = vector.extract %arg7[3] : vector<4xi32>
%164 = arith.muli %34, %114 : vector<4xi32>
%165 = vector.reduction <add>, %164, %163 : vector<4xi32> into i32
%166 = vector.insert %165, %cst_0 [0] : i32 into vector<1xi32>
%167 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%168 = vector.insert_strided_slice %76, %167 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%169 = vector.insert_strided_slice %97, %168 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%170 = vector.insert_strided_slice %118, %169 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%171 = vector.insert_strided_slice %122, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%172 = vector.insert_strided_slice %126, %171 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%173 = vector.insert_strided_slice %130, %172 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%174 = vector.insert_strided_slice %134, %173 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%175 = vector.insert_strided_slice %138, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%176 = vector.insert_strided_slice %142, %175 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%177 = vector.insert_strided_slice %146, %176 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%178 = vector.insert_strided_slice %150, %177 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%179 = vector.insert_strided_slice %154, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%180 = vector.insert_strided_slice %158, %179 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%181 = vector.insert_strided_slice %162, %180 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%182 = vector.insert_strided_slice %166, %181 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %182, %178, %174, %170 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
%16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = arith.extsi %20 : vector<4xi8> to vector<4xi32>
%32 = arith.extsi %21 : vector<4xi8> to vector<4xi32>
%33 = arith.extsi %22 : vector<4xi8> to vector<4xi32>
%34 = arith.extsi %23 : vector<4xi8> to vector<4xi32>
%35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8>
%37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8>
%43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32>
%44 = vector.extract %43[0, 0] : vector<4x1xi32>
%45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32>
%46 = vector.extract %43[1, 0] : vector<4x1xi32>
%47 = vector.insert %46, %45 [1] : i32 into vector<4xi32>
%48 = vector.extract %43[2, 0] : vector<4x1xi32>
%49 = vector.insert %48, %47 [2] : i32 into vector<4xi32>
%50 = vector.extract %43[3, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [3] : i32 into vector<4xi32>
%52 = vector.extract %arg10[0] : vector<4xi32>
%53 = arith.muli %31, %51 : vector<4xi32>
%54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32
%55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32>
%56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8>
%58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %64[0, 0] : vector<4x1xi32>
%66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32>
%67 = vector.extract %64[1, 0] : vector<4x1xi32>
%68 = vector.insert %67, %66 [1] : i32 into vector<4xi32>
%69 = vector.extract %64[2, 0] : vector<4x1xi32>
%70 = vector.insert %69, %68 [2] : i32 into vector<4xi32>
%71 = vector.extract %64[3, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [3] : i32 into vector<4xi32>
%73 = vector.extract %arg10[1] : vector<4xi32>
%74 = arith.muli %31, %72 : vector<4xi32>
%75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32
%76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32>
%77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8>
%79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8>
%85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32>
%86 = vector.extract %85[0, 0] : vector<4x1xi32>
%87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32>
%88 = vector.extract %85[1, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [1] : i32 into vector<4xi32>
%90 = vector.extract %85[2, 0] : vector<4x1xi32>
%91 = vector.insert %90, %89 [2] : i32 into vector<4xi32>
%92 = vector.extract %85[3, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [3] : i32 into vector<4xi32>
%94 = vector.extract %arg10[2] : vector<4xi32>
%95 = arith.muli %31, %93 : vector<4xi32>
%96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32
%97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32>
%98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8>
%100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8>
%106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32>
%107 = vector.extract %106[0, 0] : vector<4x1xi32>
%108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32>
%109 = vector.extract %106[1, 0] : vector<4x1xi32>
%110 = vector.insert %109, %108 [1] : i32 into vector<4xi32>
%111 = vector.extract %106[2, 0] : vector<4x1xi32>
%112 = vector.insert %111, %110 [2] : i32 into vector<4xi32>
%113 = vector.extract %106[3, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [3] : i32 into vector<4xi32>
%115 = vector.extract %arg10[3] : vector<4xi32>
%116 = arith.muli %31, %114 : vector<4xi32>
%117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32
%118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32>
%119 = vector.extract %arg9[0] : vector<4xi32>
%120 = arith.muli %32, %51 : vector<4xi32>
%121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32
%122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32>
%123 = vector.extract %arg9[1] : vector<4xi32>
%124 = arith.muli %32, %72 : vector<4xi32>
%125 = vector.reduction <add>, %124, %123 : vector<4xi32> into i32
%126 = vector.insert %125, %cst_0 [0] : i32 into vector<1xi32>
%127 = vector.extract %arg9[2] : vector<4xi32>
%128 = arith.muli %32, %93 : vector<4xi32>
%129 = vector.reduction <add>, %128, %127 : vector<4xi32> into i32
%130 = vector.insert %129, %cst_0 [0] : i32 into vector<1xi32>
%131 = vector.extract %arg9[3] : vector<4xi32>
%132 = arith.muli %32, %114 : vector<4xi32>
%133 = vector.reduction <add>, %132, %131 : vector<4xi32> into i32
%134 = vector.insert %133, %cst_0 [0] : i32 into vector<1xi32>
%135 = vector.extract %arg8[0] : vector<4xi32>
%136 = arith.muli %33, %51 : vector<4xi32>
%137 = vector.reduction <add>, %136, %135 : vector<4xi32> into i32
%138 = vector.insert %137, %cst_0 [0] : i32 into vector<1xi32>
%139 = vector.extract %arg8[1] : vector<4xi32>
%140 = arith.muli %33, %72 : vector<4xi32>
%141 = vector.reduction <add>, %140, %139 : vector<4xi32> into i32
%142 = vector.insert %141, %cst_0 [0] : i32 into vector<1xi32>
%143 = vector.extract %arg8[2] : vector<4xi32>
%144 = arith.muli %33, %93 : vector<4xi32>
%145 = vector.reduction <add>, %144, %143 : vector<4xi32> into i32
%146 = vector.insert %145, %cst_0 [0] : i32 into vector<1xi32>
%147 = vector.extract %arg8[3] : vector<4xi32>
%148 = arith.muli %33, %114 : vector<4xi32>
%149 = vector.reduction <add>, %148, %147 : vector<4xi32> into i32
%150 = vector.insert %149, %cst_0 [0] : i32 into vector<1xi32>
%151 = vector.extract %arg7[0] : vector<4xi32>
%152 = arith.muli %34, %51 : vector<4xi32>
%153 = vector.reduction <add>, %152, %151 : vector<4xi32> into i32
%154 = vector.insert %153, %cst_0 [0] : i32 into vector<1xi32>
%155 = vector.extract %arg7[1] : vector<4xi32>
%156 = arith.muli %34, %72 : vector<4xi32>
%157 = vector.reduction <add>, %156, %155 : vector<4xi32> into i32
%158 = vector.insert %157, %cst_0 [0] : i32 into vector<1xi32>
%159 = vector.extract %arg7[2] : vector<4xi32>
%160 = arith.muli %34, %93 : vector<4xi32>
%161 = vector.reduction <add>, %160, %159 : vector<4xi32> into i32
%162 = vector.insert %161, %cst_0 [0] : i32 into vector<1xi32>
%163 = vector.extract %arg7[3] : vector<4xi32>
%164 = arith.muli %34, %114 : vector<4xi32>
%165 = vector.reduction <add>, %164, %163 : vector<4xi32> into i32
%166 = vector.insert %165, %cst_0 [0] : i32 into vector<1xi32>
%167 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%168 = vector.insert_strided_slice %76, %167 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%169 = vector.insert_strided_slice %97, %168 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%170 = vector.insert_strided_slice %118, %169 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%171 = vector.insert_strided_slice %122, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%172 = vector.insert_strided_slice %126, %171 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%173 = vector.insert_strided_slice %130, %172 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%174 = vector.insert_strided_slice %134, %173 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%175 = vector.insert_strided_slice %138, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%176 = vector.insert_strided_slice %142, %175 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%177 = vector.insert_strided_slice %146, %176 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%178 = vector.insert_strided_slice %150, %177 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%179 = vector.insert_strided_slice %154, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%180 = vector.insert_strided_slice %158, %179 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%181 = vector.insert_strided_slice %162, %180 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%182 = vector.insert_strided_slice %166, %181 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %182, %178, %174, %170 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
%16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = arith.extsi %20 : vector<4xi8> to vector<4xi32>
%32 = arith.extsi %21 : vector<4xi8> to vector<4xi32>
%33 = arith.extsi %22 : vector<4xi8> to vector<4xi32>
%34 = arith.extsi %23 : vector<4xi8> to vector<4xi32>
%35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8>
%37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8>
%43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32>
%44 = vector.extract %43[0, 0] : vector<4x1xi32>
%45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32>
%46 = vector.extract %43[1, 0] : vector<4x1xi32>
%47 = vector.insert %46, %45 [1] : i32 into vector<4xi32>
%48 = vector.extract %43[2, 0] : vector<4x1xi32>
%49 = vector.insert %48, %47 [2] : i32 into vector<4xi32>
%50 = vector.extract %43[3, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [3] : i32 into vector<4xi32>
%52 = vector.extract %arg10[0] : vector<4xi32>
%53 = arith.muli %31, %51 : vector<4xi32>
%54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32
%55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32>
%56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8>
%58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %64[0, 0] : vector<4x1xi32>
%66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32>
%67 = vector.extract %64[1, 0] : vector<4x1xi32>
%68 = vector.insert %67, %66 [1] : i32 into vector<4xi32>
%69 = vector.extract %64[2, 0] : vector<4x1xi32>
%70 = vector.insert %69, %68 [2] : i32 into vector<4xi32>
%71 = vector.extract %64[3, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [3] : i32 into vector<4xi32>
%73 = vector.extract %arg10[1] : vector<4xi32>
%74 = arith.muli %31, %72 : vector<4xi32>
%75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32
%76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32>
%77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8>
%79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8>
%85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32>
%86 = vector.extract %85[0, 0] : vector<4x1xi32>
%87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32>
%88 = vector.extract %85[1, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [1] : i32 into vector<4xi32>
%90 = vector.extract %85[2, 0] : vector<4x1xi32>
%91 = vector.insert %90, %89 [2] : i32 into vector<4xi32>
%92 = vector.extract %85[3, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [3] : i32 into vector<4xi32>
%94 = vector.extract %arg10[2] : vector<4xi32>
%95 = arith.muli %31, %93 : vector<4xi32>
%96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32
%97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32>
%98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8>
%100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8>
%106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32>
%107 = vector.extract %106[0, 0] : vector<4x1xi32>
%108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32>
%109 = vector.extract %106[1, 0] : vector<4x1xi32>
%110 = vector.insert %109, %108 [1] : i32 into vector<4xi32>
%111 = vector.extract %106[2, 0] : vector<4x1xi32>
%112 = vector.insert %111, %110 [2] : i32 into vector<4xi32>
%113 = vector.extract %106[3, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [3] : i32 into vector<4xi32>
%115 = vector.extract %arg10[3] : vector<4xi32>
%116 = arith.muli %31, %114 : vector<4xi32>
%117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32
%118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32>
%119 = vector.extract %arg9[0] : vector<4xi32>
%120 = arith.muli %32, %51 : vector<4xi32>
%121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32
%122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32>
%123 = vector.extract %arg9[1] : vector<4xi32>
%124 = arith.muli %32, %72 : vector<4xi32>
%125 = vector.reduction <add>, %124, %123 : vector<4xi32> into i32
%126 = vector.insert %125, %cst_0 [0] : i32 into vector<1xi32>
%127 = vector.extract %arg9[2] : vector<4xi32>
%128 = arith.muli %32, %93 : vector<4xi32>
%129 = vector.reduction <add>, %128, %127 : vector<4xi32> into i32
%130 = vector.insert %129, %cst_0 [0] : i32 into vector<1xi32>
%131 = vector.extract %arg9[3] : vector<4xi32>
%132 = arith.muli %32, %114 : vector<4xi32>
%133 = vector.reduction <add>, %132, %131 : vector<4xi32> into i32
%134 = vector.insert %133, %cst_0 [0] : i32 into vector<1xi32>
%135 = vector.extract %arg8[0] : vector<4xi32>
%136 = arith.muli %33, %51 : vector<4xi32>
%137 = vector.reduction <add>, %136, %135 : vector<4xi32> into i32
%138 = vector.insert %137, %cst_0 [0] : i32 into vector<1xi32>
%139 = vector.extract %arg8[1] : vector<4xi32>
%140 = arith.muli %33, %72 : vector<4xi32>
%141 = vector.reduction <add>, %140, %139 : vector<4xi32> into i32
%142 = vector.insert %141, %cst_0 [0] : i32 into vector<1xi32>
%143 = vector.extract %arg8[2] : vector<4xi32>
%144 = arith.muli %33, %93 : vector<4xi32>
%145 = vector.reduction <add>, %144, %143 : vector<4xi32> into i32
%146 = vector.insert %145, %cst_0 [0] : i32 into vector<1xi32>
%147 = vector.extract %arg8[3] : vector<4xi32>
%148 = arith.muli %33, %114 : vector<4xi32>
%149 = vector.reduction <add>, %148, %147 : vector<4xi32> into i32
%150 = vector.insert %149, %cst_0 [0] : i32 into vector<1xi32>
%151 = vector.extract %arg7[0] : vector<4xi32>
%152 = arith.muli %34, %51 : vector<4xi32>
%153 = vector.reduction <add>, %152, %151 : vector<4xi32> into i32
%154 = vector.insert %153, %cst_0 [0] : i32 into vector<1xi32>
%155 = vector.extract %arg7[1] : vector<4xi32>
%156 = arith.muli %34, %72 : vector<4xi32>
%157 = vector.reduction <add>, %156, %155 : vector<4xi32> into i32
%158 = vector.insert %157, %cst_0 [0] : i32 into vector<1xi32>
%159 = vector.extract %arg7[2] : vector<4xi32>
%160 = arith.muli %34, %93 : vector<4xi32>
%161 = vector.reduction <add>, %160, %159 : vector<4xi32> into i32
%162 = vector.insert %161, %cst_0 [0] : i32 into vector<1xi32>
%163 = vector.extract %arg7[3] : vector<4xi32>
%164 = arith.muli %34, %114 : vector<4xi32>
%165 = vector.reduction <add>, %164, %163 : vector<4xi32> into i32
%166 = vector.insert %165, %cst_0 [0] : i32 into vector<1xi32>
%167 = vector.insert_strided_slice %55, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%168 = vector.insert_strided_slice %76, %167 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%169 = vector.insert_strided_slice %97, %168 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%170 = vector.insert_strided_slice %118, %169 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%171 = vector.insert_strided_slice %122, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%172 = vector.insert_strided_slice %126, %171 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%173 = vector.insert_strided_slice %130, %172 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%174 = vector.insert_strided_slice %134, %173 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%175 = vector.insert_strided_slice %138, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%176 = vector.insert_strided_slice %142, %175 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%177 = vector.insert_strided_slice %146, %176 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%178 = vector.insert_strided_slice %150, %177 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
%179 = vector.insert_strided_slice %154, %cst_1 {offsets = [0], strides = [1]} : vector<1xi32> into vector<4xi32>
%180 = vector.insert_strided_slice %158, %179 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
%181 = vector.insert_strided_slice %162, %180 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
%182 = vector.insert_strided_slice %166, %181 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
scf.yield %182, %178, %174, %170 : vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>
}
%16 = vector.transfer_write %15#3, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%17 = vector.transfer_write %15#2, %16[%c1, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%18 = vector.transfer_write %15#1, %17[%c2, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%19 = vector.transfer_write %15#0, %18[%c3, %c0] {in_bounds = [true]} : vector<4xi32>, tensor<4x4xi32>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<4x4xi32> into tensor<8x32xi32>
scf.yield %inserted_slice : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %14 : tensor<8x32xi32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : tensor<8x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
}
}
return
}
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
module {
func.func @_main_dispatch_0_matmul_1024x1024x1024() {
%cst = arith.constant dense<0> : vector<4x1xi8>
%cst_0 = arith.constant dense<0> : vector<1xi32>
%cst_1 = arith.constant dense<0> : vector<4xi32>
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%c0_i8 = arith.constant 0 : i8
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %3 to %c1024 step %4 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<8x1024xi8>
scf.for %arg1 = %5 to %c1024 step %6 {
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [8, 32], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xi32>> -> tensor<8x32xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [1024, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xi8>> -> tensor<1024x32xi8>
%10 = scf.for %arg2 = %c0 to %c8 step %c4 iter_args(%arg3 = %8) -> (tensor<8x32xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%12 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg2)
%13 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg2)
%14 = scf.for %arg4 = %c0 to %c32 step %c4 iter_args(%arg5 = %arg3) -> (tensor<8x32xi32>) {
%extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [4, 4] [1, 1] : tensor<8x32xi32> to tensor<4x4xi32>
%15:4 = scf.for %arg6 = %c0 to %c1024 step %c4 iter_args(%arg7 = %cst_1, %arg8 = %cst_1, %arg9 = %cst_1, %arg10 = %cst_1) -> (vector<4xi32>, vector<4xi32>, vector<4xi32>, vector<4xi32>) {
%20 = vector.transfer_read %7[%arg2, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%21 = vector.transfer_read %7[%11, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%22 = vector.transfer_read %7[%12, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%23 = vector.transfer_read %7[%13, %arg6], %c0_i8 {in_bounds = [true]} : tensor<8x1024xi8>, vector<4xi8>
%24 = vector.transfer_read %9[%arg6, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%25 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg6)
%26 = vector.transfer_read %9[%25, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%27 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg6)
%28 = vector.transfer_read %9[%27, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%29 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg6)
%30 = vector.transfer_read %9[%29, %arg4], %c0_i8 {in_bounds = [true]} : tensor<1024x32xi8>, vector<4xi8>
%31 = arith.extsi %20 : vector<4xi8> to vector<4xi32>
%32 = arith.extsi %21 : vector<4xi8> to vector<4xi32>
%33 = arith.extsi %22 : vector<4xi8> to vector<4xi32>
%34 = arith.extsi %23 : vector<4xi8> to vector<4xi32>
%35 = vector.extract_strided_slice %24 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%36 = vector.insert %35, %cst [0] : vector<1xi8> into vector<4x1xi8>
%37 = vector.extract_strided_slice %26 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%38 = vector.insert %37, %36 [1] : vector<1xi8> into vector<4x1xi8>
%39 = vector.extract_strided_slice %28 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%40 = vector.insert %39, %38 [2] : vector<1xi8> into vector<4x1xi8>
%41 = vector.extract_strided_slice %30 {offsets = [0], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%42 = vector.insert %41, %40 [3] : vector<1xi8> into vector<4x1xi8>
%43 = arith.extsi %42 : vector<4x1xi8> to vector<4x1xi32>
%44 = vector.extract %43[0, 0] : vector<4x1xi32>
%45 = vector.insert %44, %cst_1 [0] : i32 into vector<4xi32>
%46 = vector.extract %43[1, 0] : vector<4x1xi32>
%47 = vector.insert %46, %45 [1] : i32 into vector<4xi32>
%48 = vector.extract %43[2, 0] : vector<4x1xi32>
%49 = vector.insert %48, %47 [2] : i32 into vector<4xi32>
%50 = vector.extract %43[3, 0] : vector<4x1xi32>
%51 = vector.insert %50, %49 [3] : i32 into vector<4xi32>
%52 = vector.extract %arg10[0] : vector<4xi32>
%53 = arith.muli %31, %51 : vector<4xi32>
%54 = vector.reduction <add>, %53, %52 : vector<4xi32> into i32
%55 = vector.insert %54, %cst_0 [0] : i32 into vector<1xi32>
%56 = vector.extract_strided_slice %24 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%57 = vector.insert %56, %cst [0] : vector<1xi8> into vector<4x1xi8>
%58 = vector.extract_strided_slice %26 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%59 = vector.insert %58, %57 [1] : vector<1xi8> into vector<4x1xi8>
%60 = vector.extract_strided_slice %28 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%61 = vector.insert %60, %59 [2] : vector<1xi8> into vector<4x1xi8>
%62 = vector.extract_strided_slice %30 {offsets = [1], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%63 = vector.insert %62, %61 [3] : vector<1xi8> into vector<4x1xi8>
%64 = arith.extsi %63 : vector<4x1xi8> to vector<4x1xi32>
%65 = vector.extract %64[0, 0] : vector<4x1xi32>
%66 = vector.insert %65, %cst_1 [0] : i32 into vector<4xi32>
%67 = vector.extract %64[1, 0] : vector<4x1xi32>
%68 = vector.insert %67, %66 [1] : i32 into vector<4xi32>
%69 = vector.extract %64[2, 0] : vector<4x1xi32>
%70 = vector.insert %69, %68 [2] : i32 into vector<4xi32>
%71 = vector.extract %64[3, 0] : vector<4x1xi32>
%72 = vector.insert %71, %70 [3] : i32 into vector<4xi32>
%73 = vector.extract %arg10[1] : vector<4xi32>
%74 = arith.muli %31, %72 : vector<4xi32>
%75 = vector.reduction <add>, %74, %73 : vector<4xi32> into i32
%76 = vector.insert %75, %cst_0 [0] : i32 into vector<1xi32>
%77 = vector.extract_strided_slice %24 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%78 = vector.insert %77, %cst [0] : vector<1xi8> into vector<4x1xi8>
%79 = vector.extract_strided_slice %26 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%80 = vector.insert %79, %78 [1] : vector<1xi8> into vector<4x1xi8>
%81 = vector.extract_strided_slice %28 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%82 = vector.insert %81, %80 [2] : vector<1xi8> into vector<4x1xi8>
%83 = vector.extract_strided_slice %30 {offsets = [2], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%84 = vector.insert %83, %82 [3] : vector<1xi8> into vector<4x1xi8>
%85 = arith.extsi %84 : vector<4x1xi8> to vector<4x1xi32>
%86 = vector.extract %85[0, 0] : vector<4x1xi32>
%87 = vector.insert %86, %cst_1 [0] : i32 into vector<4xi32>
%88 = vector.extract %85[1, 0] : vector<4x1xi32>
%89 = vector.insert %88, %87 [1] : i32 into vector<4xi32>
%90 = vector.extract %85[2, 0] : vector<4x1xi32>
%91 = vector.insert %90, %89 [2] : i32 into vector<4xi32>
%92 = vector.extract %85[3, 0] : vector<4x1xi32>
%93 = vector.insert %92, %91 [3] : i32 into vector<4xi32>
%94 = vector.extract %arg10[2] : vector<4xi32>
%95 = arith.muli %31, %93 : vector<4xi32>
%96 = vector.reduction <add>, %95, %94 : vector<4xi32> into i32
%97 = vector.insert %96, %cst_0 [0] : i32 into vector<1xi32>
%98 = vector.extract_strided_slice %24 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%99 = vector.insert %98, %cst [0] : vector<1xi8> into vector<4x1xi8>
%100 = vector.extract_strided_slice %26 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%101 = vector.insert %100, %99 [1] : vector<1xi8> into vector<4x1xi8>
%102 = vector.extract_strided_slice %28 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%103 = vector.insert %102, %101 [2] : vector<1xi8> into vector<4x1xi8>
%104 = vector.extract_strided_slice %30 {offsets = [3], sizes = [1], strides = [1]} : vector<4xi8> to vector<1xi8>
%105 = vector.insert %104, %103 [3] : vector<1xi8> into vector<4x1xi8>
%106 = arith.extsi %105 : vector<4x1xi8> to vector<4x1xi32>
%107 = vector.extract %106[0, 0] : vector<4x1xi32>
%108 = vector.insert %107, %cst_1 [0] : i32 into vector<4xi32>
%109 = vector.extract %106[1, 0] : vector<4x1xi32>
%110 = vector.insert %109, %108 [1] : i32 into vector<4xi32>
%111 = vector.extract %106[2, 0] : vector<4x1xi32>
%112 = vector.insert %111, %110 [2] : i32 into vector<4xi32>
%113 = vector.extract %106[3, 0] : vector<4x1xi32>
%114 = vector.insert %113, %112 [3] : i32 into vector<4xi32>
%115 = vector.extract %arg10[3] : vector<4xi32>
%116 = arith.muli %31, %114 : vector<4xi32>
%117 = vector.reduction <add>, %116, %115 : vector<4xi32> into i32
%118 = vector.insert %117, %cst_0 [0] : i32 into vector<1xi32>
%119 = vector.extract %arg9[0] : vector<4xi32>
%120 = arith.muli %32, %51 : vector<4xi32>
%121 = vector.reduction <add>, %120, %119 : vector<4xi32> into i32
%122 = vector.insert %121, %cst_0 [0] : i32 into vector<1xi32>
%123 = vector.extract %arg9[1] : vector<4xi32>
%124 = arith.muli %32, %72 : vector<4xi32>
%125 = vector.reduction <add>, %124, %123 : vector<4xi32> into i32
%126 = vector.insert %125, %cst_0 [0] : i32 into vector<1xi32>
%127 = vector.extract %arg9[2] : vector<4xi32>
%128 = arith.muli %32, %93 : vector<4xi32>
%129 = vector.reduction <add>, %128, %127 : vector<4xi32> into i32
%130 = vector.insert %129, %cst_0 [0] : i32 into vector<1xi32>
%131 = vector.extract %arg9[3] : vector<4xi32>
%132 = arith.muli %32, %114 : vector<4xi32>
%133 = vector.reduction <add>, %132, %131 : vector<4xi32> into i32
%134 = vector.insert %133, %cst_0 [0] : i32 into vector<1xi32>
%135 = vector.extract %arg8[0] : vector<4xi32>
%136 = arith.muli %33, %51 : vector<4xi32>
%137 = vector.reduction <add>, %136, %135 : vector<4xi32> into i32
%138 = vector.insert %137, %cst_0 [0] : i32 into vector<1xi32>
%139 = vector.extract %arg8[1] : vector<4xi32>
%140 = arith.muli %33, %72 : vector<4xi32>
%141 = vector.reduction <add>, %140, %139 : vector<4xi32> into i32
%142 = vector.insert %141, %cst_0 [0] : i32 into vector<1xi32>
%143 = vector.extract %arg8[2] : vector<4xi32>
%144 = arith.muli %33, %93 : vector<4xi32>
%145 = vector.reduction <add>, %144, %143 : vector<4xi32> into i32
%146 = vector.insert %145, %cst_0 [0] : i32 into vector<1xi32>
%147 = vector.extract %arg8[3] : vector<4xi32>
%148 =