Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created January 31, 2025 20:06
Show Gist options
  • Save pashu123/b5d1f56319ad1d68b8209cbc2f70477f to your computer and use it in GitHub Desktop.
Save pashu123/b5d1f56319ad1d68b8209cbc2f70477f to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
// -----// IR Dump Before AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump Before BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before SetStrictSymbolicShapesPass (torch-iree-set-strict-symbolic-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After SetStrictSymbolicShapesPass (torch-iree-set-strict-symbolic-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before BitCastQuantTensorPass (torch-iree-bitcast-quant-tensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After BitCastQuantTensorPass (torch-iree-bitcast-quant-tensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ReduceOpVariants (torch-reduce-op-variants) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ReduceOpVariants (torch-reduce-op-variants) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertCustomQuantOp (torch-convert-custom-quant-op) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertCustomQuantOp (torch-convert-custom-quant-op) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before DecomposeComplexOps (torch-decompose-complex-ops) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After DecomposeComplexOps (torch-decompose-complex-ops) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before FuseQuantizedOps (torch-fuse-quantized-ops) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After FuseQuantizedOps (torch-fuse-quantized-ops) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ScalarizeShapes (torch-scalarize-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ScalarizeShapes (torch-scalarize-shapes) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToTMTensor (convert-torch-to-tmtensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToTMTensor (convert-torch-to-tmtensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTMTensorToLinalgExtPass (torch-iree-tm-tensor-to-linalg-ext) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTMTensorToLinalgExtPass (torch-iree-tm-tensor-to-linalg-ext) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToTensor (convert-torch-to-tensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToTensor (convert-torch-to-tensor) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToLinalg (convert-torch-to-linalg) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%0 = torch.aten.to.dtype %arg0, %int26, %false, %false, %none : !torch.vtensor<[32,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.aten.to.dtype %arg1, %int26, %false, %false, %none : !torch.vtensor<[4096,4096],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.aten.to.dtype %2, %int15, %false, %false, %none : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToLinalg (convert-torch-to-linalg) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1_0 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%cast = tensor.cast %3 : tensor<32x4096xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%c1_1 = arith.constant 1 : index
%c0_2 = arith.constant 0 : index
%c4096_3 = arith.constant 4096 : index
%c1_4 = arith.constant 1 : index
%c4096_5 = arith.constant 4096 : index
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%cast_6 = tensor.cast %5 : tensor<4096x4096xf8E4M3FNUZ> to tensor<4096x4096xf8E4M3FNUZ>
%c0_7 = arith.constant 0 : index
%dim = tensor.dim %cast, %c0_7 : tensor<32x4096xf8E4M3FNUZ>
%c1_8 = arith.constant 1 : index
%dim_9 = tensor.dim %cast_6, %c1_8 : tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty(%dim, %dim_9) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%cast, %cast_6 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%c1_10 = arith.constant 1 : index
%c0_11 = arith.constant 0 : index
%dim_12 = tensor.dim %8, %c0_11 : tensor<?x?xf32>
%c1_13 = arith.constant 1 : index
%dim_14 = tensor.dim %8, %c1_13 : tensor<?x?xf32>
%9 = tensor.empty(%dim_12, %dim_14) : tensor<?x?xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast_15 = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%c1_16 = arith.constant 1 : index
%c0_17 = arith.constant 0 : index
%c32_18 = arith.constant 32 : index
%c1_19 = arith.constant 1 : index
%c4096_20 = arith.constant 4096 : index
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast_15 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%cast_21 = tensor.cast %12 : tensor<32x4096xbf16> to tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %cast_21 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before CSE (cse) //----- //
// IR dump of @faulty (pre-CSE: note the duplicated index constants below).
// A bf16 matmul emulated through f8E4M3FNUZ: both bf16 operands are truncated
// to f8E4M3FNUZ, multiplied into an f32 accumulator, and the f32 result is
// narrowed to f8E4M3FNUZ before the final widening back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
// Leftover torch constants (dtype codes 26/f8E4M3FNUZ and 15/bf16) and
// duplicated arith index constants; these are dead and removed by later CSE.
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%false = torch.constant.bool false
%none = torch.constant.none
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1_0 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%cast = tensor.cast %3 : tensor<32x4096xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%c1_1 = arith.constant 1 : index
%c0_2 = arith.constant 0 : index
%c4096_3 = arith.constant 4096 : index
%c1_4 = arith.constant 1 : index
%c4096_5 = arith.constant 4096 : index
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%cast_6 = tensor.cast %5 : tensor<4096x4096xf8E4M3FNUZ> to tensor<4096x4096xf8E4M3FNUZ>
%c0_7 = arith.constant 0 : index
%dim = tensor.dim %cast, %c0_7 : tensor<32x4096xf8E4M3FNUZ>
%c1_8 = arith.constant 1 : index
%dim_9 = tensor.dim %cast_6, %c1_8 : tensor<4096x4096xf8E4M3FNUZ>
// Zero-filled dynamically-shaped f32 accumulator for the matmul.
%6 = tensor.empty(%dim, %dim_9) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%cast, %cast_6 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%c1_10 = arith.constant 1 : index
%c0_11 = arith.constant 0 : index
%dim_12 = tensor.dim %8, %c0_11 : tensor<?x?xf32>
%c1_13 = arith.constant 1 : index
%dim_14 = tensor.dim %8, %c1_13 : tensor<?x?xf32>
%9 = tensor.empty(%dim_12, %dim_14) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast_15 = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%c1_16 = arith.constant 1 : index
%c0_17 = arith.constant 0 : index
%c32_18 = arith.constant 32 : index
%c1_19 = arith.constant 1 : index
%c4096_20 = arith.constant 4096 : index
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast_15 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%cast_21 = tensor.cast %12 : tensor<32x4096xbf16> to tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %cast_21 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After CSE (cse) //----- //
// IR dump of @faulty after constant/expression deduplication.
// A bf16 matmul emulated through f8E4M3FNUZ: both bf16 operands are truncated
// to f8E4M3FNUZ, multiplied into an f32 accumulator, and the f32 result is
// narrowed to f8E4M3FNUZ before the final widening back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%cast = tensor.cast %3 : tensor<32x4096xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%cast_0 = tensor.cast %5 : tensor<4096x4096xf8E4M3FNUZ> to tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%dim = tensor.dim %cast, %c0 : tensor<32x4096xf8E4M3FNUZ>
%c1 = arith.constant 1 : index
%dim_1 = tensor.dim %cast_0, %c1 : tensor<4096x4096xf8E4M3FNUZ>
// Zero-filled dynamically-shaped f32 accumulator for the matmul.
%6 = tensor.empty(%dim, %dim_1) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%cast, %cast_0 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim_2 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_3 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim_2, %dim_3) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast_4 = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast_4 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%cast_5 = tensor.cast %12 : tensor<32x4096xbf16> to tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %cast_5 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToSCF (convert-torch-to-scf) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// A bf16 matmul emulated through f8E4M3FNUZ: both bf16 operands are truncated
// to f8E4M3FNUZ, multiplied into an f32 accumulator, and the f32 result is
// narrowed to f8E4M3FNUZ before the final widening back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%cast = tensor.cast %3 : tensor<32x4096xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%cast_0 = tensor.cast %5 : tensor<4096x4096xf8E4M3FNUZ> to tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%dim = tensor.dim %cast, %c0 : tensor<32x4096xf8E4M3FNUZ>
%c1 = arith.constant 1 : index
%dim_1 = tensor.dim %cast_0, %c1 : tensor<4096x4096xf8E4M3FNUZ>
// Zero-filled dynamically-shaped f32 accumulator for the matmul.
%6 = tensor.empty(%dim, %dim_1) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%cast, %cast_0 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim_2 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_3 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim_2, %dim_3) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast_4 = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast_4 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%cast_5 = tensor.cast %12 : tensor<32x4096xbf16> to tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %cast_5 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToSCF (convert-torch-to-scf) //----- //
// IR dump of @faulty: the identity casts on the matmul operands have been
// folded away (the matmul now consumes %3/%5 directly), but the accumulator
// is still allocated with dynamic sizes fed by constants %c32/%c4096.
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchToArith (convert-torch-to-arith) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ConvertTorchToArith (convert-torch-to-arith) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ConvertTorchConversionToMLProgram (convert-torch-conversion-to-mlprogram) //----- //
// Module-level IR dump wrapping @faulty (function body unchanged from the
// previous snapshot in this log). Pipeline: bf16 operands -> f8E4M3FNUZ,
// matmul into an f32 accumulator, result narrowed to f8E4M3FNUZ, then
// widened back to bf16.
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump After ConvertTorchConversionToMLProgram (convert-torch-conversion-to-mlprogram) //----- //
// Module-level IR dump wrapping @faulty (function body unchanged from the
// previous snapshot in this log). Pipeline: bf16 operands -> f8E4M3FNUZ,
// matmul into an f32 accumulator, result narrowed to f8E4M3FNUZ, then
// widened back to bf16.
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump Before ExpandOps (memref-expand) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ExpandOps (memref-expand) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
// IR dump of @faulty (unchanged from the previous snapshot in this log;
// the following canonicalization makes the intermediate shapes static).
// Pipeline: bf16 operands -> f8E4M3FNUZ, matmul into an f32 accumulator,
// result narrowed to f8E4M3FNUZ, then widened back to bf16.
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
// Truncate the 32x4096 LHS elementwise: bf16 -> f8E4M3FNUZ.
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
// Truncate the 4096x4096 RHS elementwise: bf16 -> f8E4M3FNUZ.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
// Zero-filled f32 accumulator (dynamic shape, sized 32x4096 via constants).
%6 = tensor.empty(%c32, %c4096) : tensor<?x?xf32>
%cst = arith.constant 0.000000e+00 : f32
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %8, %c1 : tensor<?x?xf32>
%9 = tensor.empty(%dim, %dim_0) : tensor<?x?xf8E4M3FNUZ>
// NOTE(review): the f32 matmul result is narrowed all the way to f8E4M3FNUZ
// here and only widened to bf16 afterwards, discarding accumulator precision.
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<?x?xf8E4M3FNUZ>
%cast = tensor.cast %10 : tensor<?x?xf8E4M3FNUZ> to tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
// Widen f8E4M3FNUZ -> bf16 to produce the returned tensor.
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cast : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%11 = tensor.empty() : tensor<32x4096xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%13 = torch_c.from_builtin_tensor %12 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %13 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After CSE (cse) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump Before Inliner (inline) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
// -----// IR Dump After Inliner (inline) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump Before FuncConversionPass (torch-iree-func-conversion) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%cst = arith.constant 0.000000e+00 : f32
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%13 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
%12 = torch_c.from_builtin_tensor %11 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
return %12 : !torch.vtensor<[32,4096],bf16>
}
}
// -----// IR Dump After FuncConversionPass (torch-iree-func-conversion) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%cst = arith.constant 0.000000e+00 : f32
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%19 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
%16 = torch_c.from_builtin_tensor %15 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%17 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%18 = hal.tensor.export %17 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %18 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%cst = arith.constant 0.000000e+00 : f32
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%19 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %19 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%19 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
%16 = torch_c.from_builtin_tensor %15 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%17 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%18 = hal.tensor.export %17 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %18 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%18 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %18 : bf16
} -> tensor<32x4096xbf16>
%16 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%17 = hal.tensor.export %16 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %17 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%18 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %18 : bf16
} -> tensor<32x4096xbf16>
%16 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%17 = hal.tensor.export %16 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %17 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%18 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %18 : bf16
} -> tensor<32x4096xbf16>
%16 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%17 = hal.tensor.export %16 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %17 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = torch_c.from_builtin_tensor %0 : tensor<32x4096xbf16> -> !torch.vtensor<[32,4096],bf16>
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%3 = torch_c.from_builtin_tensor %2 : tensor<4096x4096xbf16> -> !torch.vtensor<[4096,4096],bf16>
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[4096,4096],bf16> -> tensor<4096x4096xbf16>
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[32,4096],bf16> -> tensor<32x4096xbf16>
%6 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<32x4096xbf16>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%8 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<4096x4096xbf16>) outs(%8 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xf32>
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.matmul ins(%7, %9 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%6 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %18 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%14 = tensor.empty() : tensor<32x4096xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<32x4096xf8E4M3FNUZ>) outs(%14 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%18 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %18 : bf16
} -> tensor<32x4096xbf16>
%16 = hal.tensor.barrier join(%15 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%17 = hal.tensor.export %16 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %17 : !hal.buffer_view
}
// -----// IR Dump After FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before IREEImportPublicPass (iree-import-public) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ImportMLProgramPass (iree-import-ml-program) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Inliner (inline) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SymbolDCE (symbol-dce) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
module {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
module attributes {hal.device.targets = [#hal.device.alias<"hip"> : !hal.device]} {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
module attributes {hal.device.targets = [#hal.device.alias<"hip"> : !hal.device]} {
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.alias<"hip"> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.alias<"hip"> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.alias<"hip"> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.alias<"hip"> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before VerifyDevicesPass (iree-hal-verify-devices) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.matmul ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FoldGlobalsPass (iree-util-fold-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before IPOPass (iree-util-ipo) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before JitGlobalsPass (iree-consteval-jit-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FoldGlobalsPass (iree-util-fold-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before FuseGlobalsPass (iree-util-fuse-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before IPOPass (iree-util-ipo) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: f32, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%10 = tensor.empty() : tensor<32x4096xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<32x4096xf8E4M3FNUZ>) outs(%10 : tensor<32x4096xbf16>) {
^bb0(%in: f8E4M3FNUZ, %out: bf16):
%14 = arith.extf %in : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%12 = hal.tensor.barrier join(%11 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%13 = hal.tensor.export %12 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %13 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump Before FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%13 = arith.extf %in : f8E4M3FNUZ to f32
%14 = arith.extf %in_0 : f8E4M3FNUZ to f32
%15 = arith.mulf %13, %14 : f32
%16 = arith.addf %out, %15 : f32
linalg.yield %16 : f32
} -> tensor<32x4096xf32>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%13 = arith.truncf %in : f32 to f8E4M3FNUZ
%14 = arith.extf %13 : f8E4M3FNUZ to bf16
linalg.yield %14 : bf16
} -> tensor<32x4096xbf16>
%11 = hal.tensor.barrier join(%10 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%12 = hal.tensor.export %11 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %12 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = flow.dispatch.region -> (tensor<32x4096xf8E4M3FNUZ>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
flow.return %12 : tensor<32x4096xf8E4M3FNUZ>
}
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = flow.dispatch.region -> (tensor<4096x4096xf8E4M3FNUZ>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
flow.return %12 : tensor<4096x4096xf8E4M3FNUZ>
}
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = tensor.empty() : tensor<32x4096xbf16>
%9 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%8 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
%15 = arith.extf %14 : f8E4M3FNUZ to bf16
linalg.yield %15 : bf16
} -> tensor<32x4096xbf16>
flow.return %13 : tensor<32x4096xbf16>
}
%10 = hal.tensor.barrier join(%9 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%11 = hal.tensor.export %10 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump Before CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%3 = flow.dispatch.region -> (tensor<32x4096xf8E4M3FNUZ>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%2 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
flow.return %12 : tensor<32x4096xf8E4M3FNUZ>
}
%4 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%5 = flow.dispatch.region -> (tensor<4096x4096xf8E4M3FNUZ>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%4 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%13 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %13 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
flow.return %12 : tensor<4096x4096xf8E4M3FNUZ>
}
%6 = tensor.empty() : tensor<32x4096xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%8 = tensor.empty() : tensor<32x4096xbf16>
%9 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %5 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%7 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_0 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%8 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
%15 = arith.extf %14 : f8E4M3FNUZ to bf16
linalg.yield %15 : bf16
} -> tensor<32x4096xbf16>
flow.return %13 : tensor<32x4096xbf16>
}
%10 = hal.tensor.barrier join(%9 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%11 = hal.tensor.export %10 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %11 : !hal.buffer_view
}
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.dispatch.region -> (tensor<32x4096xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%7 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
flow.return %8 : tensor<32x4096xf8E4M3FNUZ>
}
%3 = flow.dispatch.region -> (tensor<4096x4096xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%7 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
flow.return %8 : tensor<4096x4096xf8E4M3FNUZ>
}
%4 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%7 = tensor.empty() : tensor<32x4096xbf16>
%8 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2, %3 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%9 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%12 = arith.extf %in : f8E4M3FNUZ to f32
%13 = arith.extf %in_0 : f8E4M3FNUZ to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<32x4096xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf32>) outs(%7 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%12 = arith.truncf %in : f32 to f8E4M3FNUZ
%13 = arith.extf %12 : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
flow.return %11 : tensor<32x4096xbf16>
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump Before CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.dispatch.region -> (tensor<32x4096xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<32x4096xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<32x4096xbf16>) outs(%7 : tensor<32x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<32x4096xf8E4M3FNUZ>
flow.return %8 : tensor<32x4096xf8E4M3FNUZ>
}
%3 = flow.dispatch.region -> (tensor<4096x4096xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<4096x4096xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<4096x4096xbf16>) outs(%7 : tensor<4096x4096xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<4096x4096xf8E4M3FNUZ>
flow.return %8 : tensor<4096x4096xf8E4M3FNUZ>
}
%4 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%7 = tensor.empty() : tensor<32x4096xbf16>
%8 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2, %3 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%9 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%12 = arith.extf %in : f8E4M3FNUZ to f32
%13 = arith.extf %in_0 : f8E4M3FNUZ to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<32x4096xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf32>) outs(%7 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%12 = arith.truncf %in : f32 to f8E4M3FNUZ
%13 = arith.extf %12 : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
flow.return %11 : tensor<32x4096xbf16>
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<32x4096xbf16> into tensor<131072xbf16>
%2 = flow.dispatch.region -> (tensor<131072xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed : tensor<131072xbf16>) outs(%7 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %8 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
flow.return %8 : tensor<131072xf8E4M3FNUZ>
}
%expanded = tensor.expand_shape %2 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
%collapsed_0 = tensor.collapse_shape %1 [[0, 1]] : tensor<4096x4096xbf16> into tensor<16777216xbf16>
%3 = flow.dispatch.region -> (tensor<16777216xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_0 : tensor<16777216xbf16>) outs(%7 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %8 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
flow.return %8 : tensor<16777216xf8E4M3FNUZ>
}
%expanded_1 = tensor.expand_shape %3 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
%4 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%7 = tensor.empty() : tensor<32x4096xbf16>
%8 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%expanded, %expanded_1 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%9 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
%12 = arith.extf %in : f8E4M3FNUZ to f32
%13 = arith.extf %in_2 : f8E4M3FNUZ to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<32x4096xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf32>) outs(%7 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%12 = arith.truncf %in : f32 to f8E4M3FNUZ
%13 = arith.extf %12 : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
flow.return %11 : tensor<32x4096xbf16>
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump Before ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<32x4096xbf16> into tensor<131072xbf16>
%2 = flow.dispatch.region -> (tensor<131072xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed : tensor<131072xbf16>) outs(%7 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %8 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
flow.return %8 : tensor<131072xf8E4M3FNUZ>
}
%expanded = tensor.expand_shape %2 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
%collapsed_0 = tensor.collapse_shape %1 [[0, 1]] : tensor<4096x4096xbf16> into tensor<16777216xbf16>
%3 = flow.dispatch.region -> (tensor<16777216xf8E4M3FNUZ>) {
%7 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%collapsed_0 : tensor<16777216xbf16>) outs(%7 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%9 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %9 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %8 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
flow.return %8 : tensor<16777216xf8E4M3FNUZ>
}
%expanded_1 = tensor.expand_shape %3 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
%4 = flow.dispatch.region -> (tensor<32x4096xbf16>) {
%7 = tensor.empty() : tensor<32x4096xbf16>
%8 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%expanded, %expanded_1 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%9 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
%12 = arith.extf %in : f8E4M3FNUZ to f32
%13 = arith.extf %in_2 : f8E4M3FNUZ to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<32x4096xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<32x4096xf32>) outs(%7 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%12 = arith.truncf %in : f32 to f8E4M3FNUZ
%13 = arith.extf %12 : f8E4M3FNUZ to bf16
linalg.yield %13 : bf16
} -> tensor<32x4096xbf16>
flow.return %11 : tensor<32x4096xbf16>
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<32x4096xbf16> into tensor<131072xbf16>
%2 = flow.dispatch.workgroups(%collapsed) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%8 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<131072xbf16>) outs(%8 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%10 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %10 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %9 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
flow.dispatch.tensor.store %9, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%expanded = tensor.expand_shape %2 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
%collapsed_0 = tensor.collapse_shape %1 [[0, 1]] : tensor<4096x4096xbf16> into tensor<16777216xbf16>
%3 = flow.dispatch.workgroups(%collapsed_0) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%8 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<16777216xbf16>) outs(%8 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%10 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %10 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %9 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
flow.dispatch.tensor.store %9, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%expanded_1 = tensor.expand_shape %3 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
%4 = flow.dispatch.workgroups(%expanded, %expanded_1) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%8 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %8 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_2 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
%15 = arith.extf %14 : f8E4M3FNUZ to bf16
linalg.yield %15 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %13, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump Before ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%collapsed = tensor.collapse_shape %0 [[0, 1]] : tensor<32x4096xbf16> into tensor<131072xbf16>
%2 = flow.dispatch.workgroups(%collapsed) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%8 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<131072xbf16>) outs(%8 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%10 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %10 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %9 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
flow.dispatch.tensor.store %9, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%expanded = tensor.expand_shape %2 [[0, 1]] output_shape [32, 4096] : tensor<131072xf8E4M3FNUZ> into tensor<32x4096xf8E4M3FNUZ>
%collapsed_0 = tensor.collapse_shape %1 [[0, 1]] : tensor<4096x4096xbf16> into tensor<16777216xbf16>
%3 = flow.dispatch.workgroups(%collapsed_0) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%8 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<16777216xbf16>) outs(%8 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%10 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %10 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
%expanded_2 = tensor.expand_shape %9 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
flow.dispatch.tensor.store %9, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%expanded_1 = tensor.expand_shape %3 [[0, 1]] output_shape [4096, 4096] : tensor<16777216xf8E4M3FNUZ> into tensor<4096x4096xf8E4M3FNUZ>
%4 = flow.dispatch.workgroups(%expanded, %expanded_1) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%7 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%8 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%9 = tensor.empty() : tensor<32x4096xbf16>
%10 = tensor.empty() : tensor<32x4096xf32>
%cst = arith.constant 0.000000e+00 : f32
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %8 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%11 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
%14 = arith.extf %in : f8E4M3FNUZ to f32
%15 = arith.extf %in_2 : f8E4M3FNUZ to f32
%16 = arith.mulf %14, %15 : f32
%17 = arith.addf %out, %16 : f32
linalg.yield %17 : f32
} -> tensor<32x4096xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<32x4096xf32>) outs(%9 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%14 = arith.truncf %in : f32 to f8E4M3FNUZ
%15 = arith.extf %14 : f8E4M3FNUZ to bf16
linalg.yield %15 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %13, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%5 = hal.tensor.barrier join(%4 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%6 = hal.tensor.export %5 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %6 : !hal.buffer_view
}
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump Before MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
// -----// IR Dump Before VerifyInputLegalityPass (iree-verify-input-legality) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @faulty$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- //
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #hal.device.target<"hip", [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>]> : !hal.device
util.func public @faulty$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<32x4096xbf16>
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<4096x4096xbf16>
%2 = flow.tensor.reshape %0 : tensor<32x4096xbf16> -> tensor<131072xbf16>
%3 = flow.dispatch.workgroups(%2) : (tensor<131072xbf16>) -> tensor<131072xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<131072xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xbf16>> -> tensor<131072xbf16>
%12 = tensor.empty() : tensor<131072xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<131072xbf16>) outs(%12 : tensor<131072xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<131072xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [131072], strides = [1] : tensor<131072xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<131072xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%4 = flow.tensor.reshape %3 : tensor<131072xf8E4M3FNUZ> -> tensor<32x4096xf8E4M3FNUZ>
%5 = flow.tensor.reshape %1 : tensor<4096x4096xbf16> -> tensor<16777216xbf16>
%6 = flow.dispatch.workgroups(%5) : (tensor<16777216xbf16>) -> tensor<16777216xf8E4M3FNUZ> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<16777216xbf16>>, %arg5: !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>) {
%11 = flow.dispatch.tensor.load %arg4, offsets = [0], sizes = [16777216], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16777216xbf16>> -> tensor<16777216xbf16>
%12 = tensor.empty() : tensor<16777216xf8E4M3FNUZ>
%13 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%11 : tensor<16777216xbf16>) outs(%12 : tensor<16777216xf8E4M3FNUZ>) {
^bb0(%in: bf16, %out: f8E4M3FNUZ):
%14 = arith.truncf %in : bf16 to f8E4M3FNUZ
linalg.yield %14 : f8E4M3FNUZ
} -> tensor<16777216xf8E4M3FNUZ>
flow.dispatch.tensor.store %13, %arg5, offsets = [0], sizes = [16777216], strides = [1] : tensor<16777216xf8E4M3FNUZ> -> !flow.dispatch.tensor<writeonly:tensor<16777216xf8E4M3FNUZ>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%7 = flow.tensor.reshape %6 : tensor<16777216xf8E4M3FNUZ> -> tensor<4096x4096xf8E4M3FNUZ>
%8 = flow.dispatch.workgroups(%4, %7) : (tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) -> tensor<32x4096xbf16> =
(%arg4: !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>>, %arg5: !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>) {
%cst = arith.constant 0.000000e+00 : f32
%11 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x4096xf8E4M3FNUZ>> -> tensor<32x4096xf8E4M3FNUZ>
%12 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf8E4M3FNUZ>> -> tensor<4096x4096xf8E4M3FNUZ>
%13 = tensor.empty() : tensor<32x4096xbf16>
%14 = tensor.empty() : tensor<32x4096xf32>
%15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<32x4096xf32>) -> tensor<32x4096xf32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%11, %12 : tensor<32x4096xf8E4M3FNUZ>, tensor<4096x4096xf8E4M3FNUZ>) outs(%15 : tensor<32x4096xf32>) {
^bb0(%in: f8E4M3FNUZ, %in_0: f8E4M3FNUZ, %out: f32):
%18 = arith.extf %in : f8E4M3FNUZ to f32
%19 = arith.extf %in_0 : f8E4M3FNUZ to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %out, %20 : f32
linalg.yield %21 : f32
} -> tensor<32x4096xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<32x4096xf32>) outs(%13 : tensor<32x4096xbf16>) {
^bb0(%in: f32, %out: bf16):
%18 = arith.truncf %in : f32 to f8E4M3FNUZ
%19 = arith.extf %18 : f8E4M3FNUZ to bf16
linalg.yield %19 : bf16
} -> tensor<32x4096xbf16>
flow.dispatch.tensor.store %17, %arg6, offsets = [0, 0], sizes = [32, 4096], strides = [1, 1] : tensor<32x4096xbf16> -> !flow.dispatch.tensor<writeonly:tensor<32x4096xbf16>>
flow.return
} count() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
%9 = hal.tensor.barrier join(%8 : tensor<32x4096xbf16>) => %arg3 : !hal.fence
%10 = hal.tensor.export %9 : tensor<32x4096xbf16> -> !hal.buffer_view
util.return %10 : !hal.buffer_view
}
util.func public @faulty(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = util.null : !hal.fence
%c-1_i32 = arith.constant -1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment