Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created January 31, 2025 20:06
Show Gist options
  • Save pashu123/b5d1f56319ad1d68b8209cbc2f70477f to your computer and use it in GitHub Desktop.
Save pashu123/b5d1f56319ad1d68b8209cbc2f70477f to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
// -----// IR Dump Before AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump Before BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before SetStrictSymbolicShapesPass (torch-iree-set-strict-symbolic-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After SetStrictSymbolicShapesPass (torch-iree-set-strict-symbolic-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before BitCastQuantTensorPass (torch-iree-bitcast-quant-tensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After BitCastQuantTensorPass (torch-iree-bitcast-quant-tensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ReduceOpVariants (torch-reduce-op-variants) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ReduceOpVariants (torch-reduce-op-variants) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertCustomQuantOp (torch-convert-custom-quant-op) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertCustomQuantOp (torch-convert-custom-quant-op) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before DecomposeComplexOps (torch-decompose-complex-ops) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After DecomposeComplexOps (torch-decompose-complex-ops) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before FuseQuantizedOps (torch-fuse-quantized-ops) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After FuseQuantizedOps (torch-fuse-quantized-ops) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ScalarizeShapes (torch-scalarize-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ScalarizeShapes (torch-scalarize-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToTMTensor (convert-torch-to-tmtensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToTMTensor (convert-torch-to-tmtensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTMTensorToLinalgExtPass (torch-iree-tm-tensor-to-linalg-ext) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTMTensorToLinalgExtPass (torch-iree-tm-tensor-to-linalg-ext) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToTensor (convert-torch-to-tensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToTensor (convert-torch-to-tensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToLinalg (convert-torch-to-linalg) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToLinalg (convert-torch-to-linalg) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1_0 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%cast = tensor.cast %3 : tensor<32x4096xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%c1_1 = arith.constant 1 : index
%c0_2 = arith.constant 0 : index
%c4096_3 = arith.constant 4096 : index
%c1_4 = arith.constant 1 : index
%c4096_5 = arith.constant 4096 : index
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%cast_6 = tensor.cast %5 : tensor<4096x4096xf8E4M3FNUZ> to tensor<4096x4096xf8E4M3FNUZ>
%c0_7 = arith.constant 0 : index
%dim = tensor.dim %cast, %c0_7 : tensor<32x4096xf8E4M3FNUZ>
%c1_8 = arith.constant 1 : index
%dim_9 = tensor.dim %cast_6, %c1_8 : tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty(%dim, %dim_9) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%cast, %cast_6 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%c1_10 = arith.constant 1 : index
%c0_11 = arith.constant 0 : index
%dim_12 = tensor.dim %8, %c0_11 : tensor<?x?xf32>
%c1_13 = arith.constant 1 : index
%dim_14 = tensor.dim %8, %c1_13 : tensor<?x?xf32>
%9 = tensor.empty(%dim_12, %dim_14) : tensor<?x?xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast_15 = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%c1_16 = arith.constant 1 : index
%c0_17 = arith.constant 0 : index
%c32_18 = arith.constant 32 : index
%c1_19 = arith.constant 1 : index
%c4096_20 = arith.constant 4096 : index
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast_15 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%cast_21 = tensor.cast %12 : tensor<32x4096xbf16> to tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %cast_21 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before CSE (cse) //----- //
// IR dump of @faulty (pre-CSE: note the duplicated index constants below).
// A bf16 matmul emulated through f8E4M3FNUZ: both bf16 operands are truncated
// to f8E4M3FNUZ, multiplied into an f32 accumulator, and the f32 result is
// narrowed to f8E4M3FNUZ before the final widening back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
// Leftover torch constants (dtype codes 26/f8E4M3FNUZ and 15/bf16) and
// duplicated arith index constants; these are dead and removed by later CSE.
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1_0 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%cast = tensor.cast %3 : tensor<32x4096xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%c1_1 = arith.constant 1 : index
%c0_2 = arith.constant 0 : index
%c4096_3 = arith.constant 4096 : index
%c1_4 = arith.constant 1 : index
%c4096_5 = arith.constant 4096 : index
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%cast_6 = tensor.cast %5 : tensor<4096x4096xf8E4M3FNUZ> to tensor<4096x4096xf8E4M3FNUZ>
%c0_7 = arith.constant 0 : index
%dim = tensor.dim %cast, %c0_7 : tensor<32x4096xf8E4M3FNUZ>
%c1_8 = arith.constant 1 : index
%dim_9 = tensor.dim %cast_6, %c1_8 : tensor<4096x4096xf8E4M3FNUZ>
// Zero-filled dynamically-shaped f32 accumulator for the matmul.
%6 = tensor.empty(%dim, %dim_9) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%cast, %cast_6 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%c1_10 = arith.constant 1 : index
%c0_11 = arith.constant 0 : index
%dim_12 = tensor.dim %8, %c0_11 : tensor<?x?xf32>
%c1_13 = arith.constant 1 : index
%dim_14 = tensor.dim %8, %c1_13 : tensor<?x?xf32>
%9 = tensor.empty(%dim_12, %dim_14) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast_15 = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%c1_16 = arith.constant 1 : index
%c0_17 = arith.constant 0 : index
%c32_18 = arith.constant 32 : index
%c1_19 = arith.constant 1 : index
%c4096_20 = arith.constant 4096 : index
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast_15 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%cast_21 = tensor.cast %12 : tensor<32x4096xbf16> to tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %cast_21 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After CSE (cse) //----- //
// IR dump of @faulty after constant/expression deduplication.
// A bf16 matmul emulated through f8E4M3FNUZ: both bf16 operands are truncated
// to f8E4M3FNUZ, multiplied into an f32 accumulator, and the f32 result is
// narrowed to f8E4M3FNUZ before the final widening back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%cast = tensor.cast %3 : tensor<32x4096xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%cast_0 = tensor.cast %5 : tensor<4096x4096xf8E4M3FNUZ> to tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%dim = tensor.dim %cast, %c0 : tensor<32x4096xf8E4M3FNUZ>
%c1 = arith.constant 1 : index
%dim_1 = tensor.dim %cast_0, %c1 : tensor<4096x4096xf8E4M3FNUZ>
// Zero-filled dynamically-shaped f32 accumulator for the matmul.
%6 = tensor.empty(%dim, %dim_1) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%cast, %cast_0 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim_2 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_3 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim_2, %dim_3) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast_4 = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast_4 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%cast_5 = tensor.cast %12 : tensor<32x4096xbf16> to tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %cast_5 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToSCF (convert-torch-to-scf) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// A bf16 matmul emulated through f8E4M3FNUZ: both bf16 operands are truncated
// to f8E4M3FNUZ, multiplied into an f32 accumulator, and the f32 result is
// narrowed to f8E4M3FNUZ before the final widening back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%cast = tensor.cast %3 : tensor<32x4096xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%cast_0 = tensor.cast %5 : tensor<4096x4096xf8E4M3FNUZ> to tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%dim = tensor.dim %cast, %c0 : tensor<32x4096xf8E4M3FNUZ>
%c1 = arith.constant 1 : index
%dim_1 = tensor.dim %cast_0, %c1 : tensor<4096x4096xf8E4M3FNUZ>
// Zero-filled dynamically-shaped f32 accumulator for the matmul.
%6 = tensor.empty(%dim, %dim_1) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%cast, %cast_0 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim_2 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_3 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim_2, %dim_3) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast_4 = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast_4 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%cast_5 = tensor.cast %12 : tensor<32x4096xbf16> to tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %cast_5 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToSCF (convert-torch-to-scf) //----- //
// IR dump of @faulty: the identity casts on the matmul operands have been
// folded away (the matmul now consumes %3/%5 directly), but the accumulator
// is still allocated with dynamic sizes fed by constants %c32/%c4096.
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToArith (convert-torch-to-arith) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToArith (convert-torch-to-arith) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchConversionToMLProgram (convert-torch-conversion-to-mlprogram) //----- //
// Module-level IR dump wrapping @faulty (function body unchanged from the
// previous snapshot in this log). Pipeline: bf16 operands -> f8E4M3FNUZ,
// matmul into an f32 accumulator, result narrowed to f8E4M3FNUZ, then
// widened back to bf16.
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump After ConvertTorchConversionToMLProgram (convert-torch-conversion-to-mlprogram) //----- //
// Module-level IR dump wrapping @faulty (function body unchanged from the
// previous snapshot in this log). Pipeline: bf16 operands -> f8E4M3FNUZ,
// matmul into an f32 accumulator, result narrowed to f8E4M3FNUZ, then
// widened back to bf16.
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump Before ExpandOps (memref-expand) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ExpandOps (memref-expand) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log;
// the following canonicalization makes the intermediate shapes static).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After CSE (cse) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before Inliner (inline) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Inliner (inline) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump Before FuncConversionPass (torch-iree-func-conversion) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump After FuncConversionPass (torch-iree-func-conversion) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%cst = arith.constant 0.000000e+00 : f32
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%19 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
%16 = torch_c.from_builtin_tensor %15 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%17 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%18 = hal.tensor.export %17 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %18 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%cst = arith.constant 0.000000e+00 : f32
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%19 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
%16 = torch_c.from_builtin_tensor %15 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%17 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%18 = hal.tensor.export %17 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %18 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%18 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %18 : bf16
} -> tensor<32x4096xbf16>
%16 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%17 = hal.tensor.export %16 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %17 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%18 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %18 : bf16
} -> tensor<32x4096xbf16>
%16 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%17 = hal.tensor.export %16 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %17 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%18 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %18 : bf16
} -> tensor<32x4096xbf16>
%16 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%17 = hal.tensor.export %16 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %17 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%18 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %18 : bf16
} -> tensor<32x4096xbf16>
%16 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%17 = hal.tensor.export %16 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %17 : !hal.buffer_view
}
// -----// IR Dump After FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before IREEImportPublicPass (iree-import-public) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ImportMLProgramPass (iree-import-ml-program) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Inliner (inline) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
module attributes {hal.device.targets = [#hal.device.alias<"hip"> : !hal.device]} {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
module attributes {hal.device.targets = [#hal.device.alias<"hip"> : !hal.device]} {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.alias<"hip"> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.alias<"hip"> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.alias<"hip"> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.alias<"hip"> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before VerifyDevicesPass (iree-hal-verify-devices) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FoldGlobalsPass (iree-util-fold-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before IPOPass (iree-util-ipo) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before JitGlobalsPass (iree-consteval-jit-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FoldGlobalsPass (iree-util-fold-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before FuseGlobalsPass (iree-util-fuse-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before IPOPass (iree-util-ipo) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = flow.dispatch.region -> (tensor<32x4096xf8E4M3FNUZ>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
flow.return %12 : tensor<32x4096xf8E4M3FNUZ>
}
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = flow.dispatch.region -> (tensor<4096x4096xf8E4M3FNUZ>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
flow.return %12 : tensor<4096x4096xf8E4M3FNUZ>
}
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = tensor.empty() : tensor<32x4096xbf16>
%9 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%8 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
%15 = arith.extf %14 : f8E4M3FNUZ to bf16
linalg.yield %15 : bf16
} -> tensor<32x4096xbf16>
flow.return %13 : tensor<32x4096xbf16>
}
%10 = hal.tensor.barrier join(%9 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%11 = hal.tensor.export %10 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump Before CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = flow.dispatch.region -> (tensor<32x4096xf8E4M3FNUZ>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
flow.return %12 : tensor<32x4096xf8E4M3FNUZ>
}
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = flow.dispatch.region -> (tensor<4096x4096xf8E4M3FNUZ>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
flow.return %12 : tensor<4096x4096xf8E4M3FNUZ>
}
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = tensor.empty() : tensor<32x4096xbf16>
%9 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%8 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
%15 = arith.extf %14 : f8E4M3FNUZ to bf16
linalg.yield %15 : bf16
} -> tensor<32x4096xbf16>
flow.return %13 : tensor<32x4096xbf16>
}
%10 = hal.tensor.barrier join(%9 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%11 = hal.tensor.export %10 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.dispatch.region -> (tensor<32x4096xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%7 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
flow.return %8 : tensor<32x4096xf8E4M3FNUZ>
}
%3 = flow.dispatch.region -> (tensor<4096x4096xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%7 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
flow.return %8 : tensor<4096x4096xf8E4M3FNUZ>
}
%4 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%7 = tensor.empty() : tensor<32x4096xbf16>
%8 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2, %3 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%9 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%12 = arith.extf %in : f8E4M3FNUZ to f32
%13 = arith.extf %in_0 : f8E4M3FNUZ to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<32x4096xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf32>) outs(%7 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%12 = arith.truncf %in : f32 to f8E4M3FNUZ
%13 = arith.extf %12 : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
flow.return %11 : tensor<32x4096xbf16>
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump Before CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.dispatch.region -> (tensor<32x4096xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%7 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
flow.return %8 : tensor<32x4096xf8E4M3FNUZ>
}
%3 = flow.dispatch.region -> (tensor<4096x4096xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%7 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
flow.return %8 : tensor<4096x4096xf8E4M3FNUZ>
}
%4 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%7 = tensor.empty() : tensor<32x4096xbf16>
%8 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2, %3 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%9 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%12 = arith.extf %in : f8E4M3FNUZ to f32
%13 = arith.extf %in_0 : f8E4M3FNUZ to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<32x4096xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf32>) outs(%7 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%12 = arith.truncf %in : f32 to f8E4M3FNUZ
%13 = arith.extf %12 : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
flow.return %11 : tensor<32x4096xbf16>
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<32x4096xbf16> into tensor<131072xbf16>
%2 = flow.dispatch.region -> (tensor<131072xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed : tensor<131072xbf16>) outs(%7 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %8 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
flow.return %8 : tensor<131072xf8E4M3FNUZ>
}
%expanded = tensor.expand_shape %2 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
%collapsed_0 = tensor.collapse_shape %1 [[0, 1]] : tensor<4096x4096xbf16> into tensor<16777216xbf16>
%3 = flow.dispatch.region -> (tensor<16777216xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_0 : tensor<16777216xbf16>) outs(%7 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %8 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
flow.return %8 : tensor<16777216xf8E4M3FNUZ>
}
%expanded_1 = tensor.expand_shape %3 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
%4 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%7 = tensor.empty() : tensor<32x4096xbf16>
%8 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%expanded, %expanded_1 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%9 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
%12 = arith.extf %in : f8E4M3FNUZ to f32
%13 = arith.extf %in_2 : f8E4M3FNUZ to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<32x4096xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf32>) outs(%7 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%12 = arith.truncf %in : f32 to f8E4M3FNUZ
%13 = arith.extf %12 : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
flow.return %11 : tensor<32x4096xbf16>
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump Before ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<32x4096xbf16> into tensor<131072xbf16>
%2 = flow.dispatch.region -> (tensor<131072xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed : tensor<131072xbf16>) outs(%7 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %8 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
flow.return %8 : tensor<131072xf8E4M3FNUZ>
}
%expanded = tensor.expand_shape %2 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
%collapsed_0 = tensor.collapse_shape %1 [[0, 1]] : tensor<4096x4096xbf16> into tensor<16777216xbf16>
%3 = flow.dispatch.region -> (tensor<16777216xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_0 : tensor<16777216xbf16>) outs(%7 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %8 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
flow.return %8 : tensor<16777216xf8E4M3FNUZ>
}
%expanded_1 = tensor.expand_shape %3 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
%4 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%7 = tensor.empty() : tensor<32x4096xbf16>
%8 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%expanded, %expanded_1 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%9 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
%12 = arith.extf %in : f8E4M3FNUZ to f32
%13 = arith.extf %in_2 : f8E4M3FNUZ to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<32x4096xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf32>) outs(%7 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%12 = arith.truncf %in : f32 to f8E4M3FNUZ
%13 = arith.extf %12 : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
flow.return %11 : tensor<32x4096xbf16>
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<32x4096xbf16> into tensor<131072xbf16>
%2 = flow.dispatch.workgroups(%collapsed) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%8 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<131072xbf16>) outs(%8 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%10 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %10 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %9 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
flow.dispatch.tensor.store %9, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%expanded = tensor.expand_shape %2 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
%collapsed_0 = tensor.collapse_shape %1 [[0, 1]] : tensor<4096x4096xbf16> into tensor<16777216xbf16>
%3 = flow.dispatch.workgroups(%collapsed_0) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%8 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<16777216xbf16>) outs(%8 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%10 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %10 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %9 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
flow.dispatch.tensor.store %9, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%expanded_1 = tensor.expand_shape %3 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
%4 = flow.dispatch.workgroups(%expanded, %expanded_1) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%8 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %8 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_2 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
%15 = arith.extf %14 : f8E4M3FNUZ to bf16
linalg.yield %15 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %13, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump Before ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<32x4096xbf16> into tensor<131072xbf16>
%2 = flow.dispatch.workgroups(%collapsed) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%8 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<131072xbf16>) outs(%8 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%10 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %10 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %9 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
flow.dispatch.tensor.store %9, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%expanded = tensor.expand_shape %2 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
%collapsed_0 = tensor.collapse_shape %1 [[0, 1]] : tensor<4096x4096xbf16> into tensor<16777216xbf16>
%3 = flow.dispatch.workgroups(%collapsed_0) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%8 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<16777216xbf16>) outs(%8 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%10 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %10 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %9 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
flow.dispatch.tensor.store %9, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%expanded_1 = tensor.expand_shape %3 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
%4 = flow.dispatch.workgroups(%expanded, %expanded_1) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%8 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %8 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_2 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
%15 = arith.extf %14 : f8E4M3FNUZ to bf16
linalg.yield %15 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %13, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump Before MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before VerifyInputLegalityPass (iree-verify-input-legality) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment