Created
April 9, 2025 02:55
-
-
Save pashu123/c4494eeb95c995de448b3960a4cea9fb to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
// var_mean over dims [2,3] of %arg0 (upcast f16->f32), then (%arg1 - mean) * rsqrt(var + 1e-5).
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %int2 = torch.constant.int 2
  %int3 = torch.constant.int 3
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %int0 = torch.constant.int 0
  %true = torch.constant.bool true
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %int1 = torch.constant.int 1
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %int1_0 = torch.constant.int 1
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1_0 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After SetStrictSymbolicShapesPass (torch-iree-set-strict-symbolic-shapes) //----- //
// Unchanged from the previous dump: var_mean over dims [2,3], then (%arg1 - mean) * rsqrt(var + 1e-5).
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %int2 = torch.constant.int 2
  %int3 = torch.constant.int 3
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %int0 = torch.constant.int 0
  %true = torch.constant.bool true
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %int1 = torch.constant.int 1
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %int1_0 = torch.constant.int 1
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1_0 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Canonicalization hoisted/deduplicated constants (%int1_0 folded into %int1); ops unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int0 = torch.constant.int 0
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After BitCastQuantTensorPass (torch-iree-bitcast-quant-tensor) //----- //
// No quantized tensors present; IR unchanged from the canonicalized form.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int0 = torch.constant.int 0
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ReduceOpVariants (torch-reduce-op-variants) //----- //
// No in-place/overload variants to normalize; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int0 = torch.constant.int 0
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertCustomQuantOp (torch-convert-custom-quant-op) //----- //
// No custom quant ops present; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int0 = torch.constant.int 0
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After DecomposeComplexOps (torch-decompose-complex-ops) //----- //
// var_mean.correction is decomposed into sum/div/sub/mul primitives; the variance path is
// computed in f64 (dtype 7) and the mean path in f32 (dtype 6). 163840 = 10 * 16384, the
// number of elements reduced over dims [2,3].
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After FuseQuantizedOps (torch-fuse-quantized-ops) //----- //
// No quantized ops to fuse; IR unchanged from the decomposed form.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Nothing left to fold; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ScalarizeShapes (torch-scalarize-shapes) //----- //
// Shapes are fully static; nothing to scalarize, IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertTorchToTMTensor (convert-torch-to-tmtensor) //----- //
// No ops lower to TMTensor here; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertTMTensorToLinalgExtPass (torch-iree-tm-tensor-to-linalg-ext) //----- //
// No TMTensor ops present; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertTorchToTensor (convert-torch-to-tensor) //----- //
// No torch ops lower to the tensor dialect here; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertTorchToLinalg (convert-torch-to-linalg) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%2 = torch_c.to_f64 %float1.000000e00 | |
%int163840 = torch.constant.int 163840 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%int7 = torch.constant.int 7 | |
%int1 = torch.constant.int 1 | |
%c1_i64 = arith.constant 1 : i64 | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%3 = torch_c.to_f64 %float1.000000e-05 | |
%true = torch.constant.bool true | |
%int3 = torch.constant.int 3 | |
%int2 = torch.constant.int 2 | |
%int6 = torch.constant.int 6 | |
%false = torch.constant.bool false | |
%none = torch.constant.none | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1_0 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_1 = arith.constant 2 : index | |
%c10 = arith.constant 10 : index | |
%c3 = arith.constant 3 : index | |
%c16384 = arith.constant 16384 : index | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%39 = arith.extf %in : f16 to f32 | |
linalg.yield %39 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast = tensor.cast %5 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%6 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int> | |
%c1_2 = arith.constant 1 : index | |
%c0_3 = arith.constant 0 : index | |
%c2_4 = arith.constant 2 : index | |
%c1_5 = arith.constant 1 : index | |
%c32_6 = arith.constant 32 : index | |
%c2_7 = arith.constant 2 : index | |
%c10_8 = arith.constant 10 : index | |
%c3_9 = arith.constant 3 : index | |
%c16384_10 = arith.constant 16384 : index | |
%7 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%39 = arith.extf %in : f32 to f64 | |
linalg.yield %39 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_11 = tensor.cast %8 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%cst = arith.constant 0.000000e+00 : f64 | |
%c1_12 = arith.constant 1 : index | |
%c0_13 = arith.constant 0 : index | |
%dim = tensor.dim %cast_11, %c0_13 : tensor<2x32x10x16384xf64> | |
%c1_14 = arith.constant 1 : index | |
%dim_15 = tensor.dim %cast_11, %c1_14 : tensor<2x32x10x16384xf64> | |
%c2_16 = arith.constant 2 : index | |
%dim_17 = tensor.dim %cast_11, %c2_16 : tensor<2x32x10x16384xf64> | |
%c3_18 = arith.constant 3 : index | |
%dim_19 = tensor.dim %cast_11, %c3_18 : tensor<2x32x10x16384xf64> | |
%9 = tensor.empty(%dim, %dim_15) : tensor<?x?x1x1xf64> | |
%10 = linalg.fill ins(%cst : f64) outs(%9 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_11 : tensor<2x32x10x16384xf64>) outs(%10 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%39 = arith.addf %in, %out : f64 | |
linalg.yield %39 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_20 = tensor.cast %11 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%c1_21 = arith.constant 1 : index | |
%c0_22 = arith.constant 0 : index | |
%c2_23 = arith.constant 2 : index | |
%c1_24 = arith.constant 1 : index | |
%c32_25 = arith.constant 32 : index | |
%c2_26 = arith.constant 2 : index | |
%c1_27 = arith.constant 1 : index | |
%c3_28 = arith.constant 3 : index | |
%c1_29 = arith.constant 1 : index | |
%12 = tensor.empty() : tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_20 : tensor<2x32x1x1xf64>) outs(%12 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%39 = arith.sitofp %c163840_i64 : i64 to f64 | |
%40 = arith.divf %in, %39 : f64 | |
linalg.yield %40 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%cast_30 = tensor.cast %13 : tensor<2x32x1x1xf64> to tensor<2x32x1x1xf64> | |
%c1_31 = arith.constant 1 : index | |
%c0_32 = arith.constant 0 : index | |
%c2_33 = arith.constant 2 : index | |
%c1_34 = arith.constant 1 : index | |
%c32_35 = arith.constant 32 : index | |
%c2_36 = arith.constant 2 : index | |
%c10_37 = arith.constant 10 : index | |
%c3_38 = arith.constant 3 : index | |
%c16384_39 = arith.constant 16384 : index | |
%c0_40 = arith.constant 0 : index | |
%c2_41 = arith.constant 2 : index | |
%c1_42 = arith.constant 1 : index | |
%c32_43 = arith.constant 32 : index | |
%14 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_11, %cast_30 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_163: f64, %out: f64): | |
%39 = arith.mulf %in_163, %2 : f64 | |
%40 = arith.subf %in, %39 : f64 | |
linalg.yield %40 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_44 = tensor.cast %15 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%c1_45 = arith.constant 1 : index | |
%c0_46 = arith.constant 0 : index | |
%c2_47 = arith.constant 2 : index | |
%c1_48 = arith.constant 1 : index | |
%c32_49 = arith.constant 32 : index | |
%c2_50 = arith.constant 2 : index | |
%c10_51 = arith.constant 10 : index | |
%c3_52 = arith.constant 3 : index | |
%c16384_53 = arith.constant 16384 : index | |
%c0_54 = arith.constant 0 : index | |
%c2_55 = arith.constant 2 : index | |
%c1_56 = arith.constant 1 : index | |
%c32_57 = arith.constant 32 : index | |
%c2_58 = arith.constant 2 : index | |
%c10_59 = arith.constant 10 : index | |
%c3_60 = arith.constant 3 : index | |
%c16384_61 = arith.constant 16384 : index | |
%16 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_44, %cast_44 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%16 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_163: f64, %out: f64): | |
%39 = arith.mulf %in, %in_163 : f64 | |
linalg.yield %39 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_62 = tensor.cast %17 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%cst_63 = arith.constant 0.000000e+00 : f64 | |
%c1_64 = arith.constant 1 : index | |
%c0_65 = arith.constant 0 : index | |
%dim_66 = tensor.dim %cast_62, %c0_65 : tensor<2x32x10x16384xf64> | |
%c1_67 = arith.constant 1 : index | |
%dim_68 = tensor.dim %cast_62, %c1_67 : tensor<2x32x10x16384xf64> | |
%c2_69 = arith.constant 2 : index | |
%dim_70 = tensor.dim %cast_62, %c2_69 : tensor<2x32x10x16384xf64> | |
%c3_71 = arith.constant 3 : index | |
%dim_72 = tensor.dim %cast_62, %c3_71 : tensor<2x32x10x16384xf64> | |
%18 = tensor.empty(%dim_66, %dim_68) : tensor<?x?x1x1xf64> | |
%19 = linalg.fill ins(%cst_63 : f64) outs(%18 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_62 : tensor<2x32x10x16384xf64>) outs(%19 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%39 = arith.addf %in, %out : f64 | |
linalg.yield %39 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_73 = tensor.cast %20 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%c1_74 = arith.constant 1 : index | |
%c0_75 = arith.constant 0 : index | |
%c2_76 = arith.constant 2 : index | |
%c1_77 = arith.constant 1 : index | |
%c32_78 = arith.constant 32 : index | |
%c2_79 = arith.constant 2 : index | |
%c1_80 = arith.constant 1 : index | |
%c3_81 = arith.constant 3 : index | |
%c1_82 = arith.constant 1 : index | |
%21 = tensor.empty() : tensor<2x32x1x1xf64> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_73 : tensor<2x32x1x1xf64>) outs(%21 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%39 = arith.sitofp %c163840_i64 : i64 to f64 | |
%40 = arith.divf %in, %39 : f64 | |
linalg.yield %40 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%cast_83 = tensor.cast %22 : tensor<2x32x1x1xf64> to tensor<2x32x1x1xf64> | |
%c1_84 = arith.constant 1 : index | |
%c0_85 = arith.constant 0 : index | |
%c2_86 = arith.constant 2 : index | |
%c1_87 = arith.constant 1 : index | |
%c32_88 = arith.constant 32 : index | |
%c2_89 = arith.constant 2 : index | |
%c1_90 = arith.constant 1 : index | |
%c3_91 = arith.constant 3 : index | |
%c1_92 = arith.constant 1 : index | |
%23 = tensor.empty() : tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_83 : tensor<2x32x1x1xf64>) outs(%23 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%39 = arith.truncf %in : f64 to f32 | |
linalg.yield %39 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_93 = tensor.cast %24 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%cst_94 = arith.constant 0.000000e+00 : f32 | |
%c1_95 = arith.constant 1 : index | |
%c0_96 = arith.constant 0 : index | |
%dim_97 = tensor.dim %cast, %c0_96 : tensor<2x32x10x16384xf32> | |
%c1_98 = arith.constant 1 : index | |
%dim_99 = tensor.dim %cast, %c1_98 : tensor<2x32x10x16384xf32> | |
%c2_100 = arith.constant 2 : index | |
%dim_101 = tensor.dim %cast, %c2_100 : tensor<2x32x10x16384xf32> | |
%c3_102 = arith.constant 3 : index | |
%dim_103 = tensor.dim %cast, %c3_102 : tensor<2x32x10x16384xf32> | |
%25 = tensor.empty(%dim_97, %dim_99) : tensor<?x?x1x1xf32> | |
%26 = linalg.fill ins(%cst_94 : f32) outs(%25 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast : tensor<2x32x10x16384xf32>) outs(%26 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%39 = arith.addf %in, %out : f32 | |
linalg.yield %39 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_104 = tensor.cast %27 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%c1_105 = arith.constant 1 : index | |
%c0_106 = arith.constant 0 : index | |
%c2_107 = arith.constant 2 : index | |
%c1_108 = arith.constant 1 : index | |
%c32_109 = arith.constant 32 : index | |
%c2_110 = arith.constant 2 : index | |
%c1_111 = arith.constant 1 : index | |
%c3_112 = arith.constant 3 : index | |
%c1_113 = arith.constant 1 : index | |
%28 = tensor.empty() : tensor<2x32x1x1xf32> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_104 : tensor<2x32x1x1xf32>) outs(%28 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%39 = arith.sitofp %c163840_i64 : i64 to f32 | |
%40 = arith.divf %in, %39 : f32 | |
linalg.yield %40 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_114 = tensor.cast %29 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%c1_115 = arith.constant 1 : index | |
%c0_116 = arith.constant 0 : index | |
%c2_117 = arith.constant 2 : index | |
%c1_118 = arith.constant 1 : index | |
%c32_119 = arith.constant 32 : index | |
%c2_120 = arith.constant 2 : index | |
%c1_121 = arith.constant 1 : index | |
%c3_122 = arith.constant 3 : index | |
%c1_123 = arith.constant 1 : index | |
%30 = tensor.empty() : tensor<2x32x1x1xf32> | |
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_93 : tensor<2x32x1x1xf32>) outs(%30 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%39 = arith.truncf %3 : f64 to f32 | |
%40 = arith.sitofp %c1_i64 : i64 to f32 | |
%41 = arith.mulf %39, %40 : f32 | |
%42 = arith.addf %in, %41 : f32 | |
linalg.yield %42 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_124 = tensor.cast %31 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%c1_125 = arith.constant 1 : index | |
%c0_126 = arith.constant 0 : index | |
%c2_127 = arith.constant 2 : index | |
%c1_128 = arith.constant 1 : index | |
%c32_129 = arith.constant 32 : index | |
%c2_130 = arith.constant 2 : index | |
%c1_131 = arith.constant 1 : index | |
%c3_132 = arith.constant 3 : index | |
%c1_133 = arith.constant 1 : index | |
%32 = tensor.empty() : tensor<2x32x1x1xf32> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_124 : tensor<2x32x1x1xf32>) outs(%32 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%39 = math.rsqrt %in : f32 | |
linalg.yield %39 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_134 = tensor.cast %33 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%c1_135 = arith.constant 1 : index | |
%c0_136 = arith.constant 0 : index | |
%c2_137 = arith.constant 2 : index | |
%c1_138 = arith.constant 1 : index | |
%c32_139 = arith.constant 32 : index | |
%c2_140 = arith.constant 2 : index | |
%c10_141 = arith.constant 10 : index | |
%c3_142 = arith.constant 3 : index | |
%c16384_143 = arith.constant 16384 : index | |
%c0_144 = arith.constant 0 : index | |
%c2_145 = arith.constant 2 : index | |
%c1_146 = arith.constant 1 : index | |
%c32_147 = arith.constant 32 : index | |
%34 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %cast_114 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%34 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_163: f32, %out: f32): | |
%39 = arith.extf %in : f16 to f32 | |
%40 = arith.sitofp %c1_i64 : i64 to f32 | |
%41 = arith.mulf %in_163, %40 : f32 | |
%42 = arith.subf %39, %41 : f32 | |
linalg.yield %42 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast_148 = tensor.cast %35 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%c1_149 = arith.constant 1 : index | |
%c0_150 = arith.constant 0 : index | |
%c2_151 = arith.constant 2 : index | |
%c1_152 = arith.constant 1 : index | |
%c32_153 = arith.constant 32 : index | |
%c2_154 = arith.constant 2 : index | |
%c10_155 = arith.constant 10 : index | |
%c3_156 = arith.constant 3 : index | |
%c16384_157 = arith.constant 16384 : index | |
%c0_158 = arith.constant 0 : index | |
%c2_159 = arith.constant 2 : index | |
%c1_160 = arith.constant 1 : index | |
%c32_161 = arith.constant 32 : index | |
%36 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_148, %cast_134 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%36 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_163: f32, %out: f32): | |
%39 = arith.mulf %in, %in_163 : f32 | |
linalg.yield %39 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast_162 = tensor.cast %37 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%38 = torch_c.from_builtin_tensor %cast_162 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %38 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%2 = torch_c.to_f64 %float1.000000e00 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%3 = torch_c.to_f64 %float1.000000e-05 | |
%int3 = torch.constant.int 3 | |
%int2 = torch.constant.int 2 | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast = tensor.cast %5 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%30 = arith.extf %in : f32 to f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_0 = tensor.cast %7 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%cst = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%dim = tensor.dim %cast_0, %c0 : tensor<2x32x10x16384xf64> | |
%c1 = arith.constant 1 : index | |
%dim_1 = tensor.dim %cast_0, %c1 : tensor<2x32x10x16384xf64> | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%8 = tensor.empty(%dim, %dim_1) : tensor<?x?x1x1xf64> | |
%9 = linalg.fill ins(%cst : f64) outs(%8 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_0 : tensor<2x32x10x16384xf64>) outs(%9 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_2 = tensor.cast %10 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%11 = tensor.empty() : tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_2 : tensor<2x32x1x1xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.sitofp %c163840_i64 : i64 to f64 | |
%31 = arith.divf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%cast_3 = tensor.cast %12 : tensor<2x32x1x1xf64> to tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_0, %cast_3 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_20: f64, %out: f64): | |
%30 = arith.mulf %in_20, %2 : f64 | |
%31 = arith.subf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_4 = tensor.cast %13 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_4, %cast_4 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_20: f64, %out: f64): | |
%30 = arith.mulf %in, %in_20 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_5 = tensor.cast %14 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%dim_6 = tensor.dim %cast_5, %c0 : tensor<2x32x10x16384xf64> | |
%dim_7 = tensor.dim %cast_5, %c1 : tensor<2x32x10x16384xf64> | |
%15 = tensor.empty(%dim_6, %dim_7) : tensor<?x?x1x1xf64> | |
%16 = linalg.fill ins(%cst : f64) outs(%15 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_5 : tensor<2x32x10x16384xf64>) outs(%16 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_8 = tensor.cast %17 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_8 : tensor<2x32x1x1xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.sitofp %c163840_i64 : i64 to f64 | |
%31 = arith.divf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%cast_9 = tensor.cast %18 : tensor<2x32x1x1xf64> to tensor<2x32x1x1xf64> | |
%19 = tensor.empty() : tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_9 : tensor<2x32x1x1xf64>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%30 = arith.truncf %in : f64 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_10 = tensor.cast %20 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%cst_11 = arith.constant 0.000000e+00 : f32 | |
%dim_12 = tensor.dim %cast, %c0 : tensor<2x32x10x16384xf32> | |
%dim_13 = tensor.dim %cast, %c1 : tensor<2x32x10x16384xf32> | |
%21 = tensor.empty(%dim_12, %dim_13) : tensor<?x?x1x1xf32> | |
%22 = linalg.fill ins(%cst_11 : f32) outs(%21 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast : tensor<2x32x10x16384xf32>) outs(%22 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.addf %in, %out : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_14 = tensor.cast %23 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_14 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.sitofp %c163840_i64 : i64 to f32 | |
%31 = arith.divf %in, %30 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_15 = tensor.cast %24 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_10 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.truncf %3 : f64 to f32 | |
%31 = arith.sitofp %c1_i64 : i64 to f32 | |
%32 = arith.mulf %30, %31 : f32 | |
%33 = arith.addf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_16 = tensor.cast %25 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_16 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = math.rsqrt %in : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_17 = tensor.cast %26 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %cast_15 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_20: f32, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
%31 = arith.sitofp %c1_i64 : i64 to f32 | |
%32 = arith.mulf %in_20, %31 : f32 | |
%33 = arith.subf %30, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast_18 = tensor.cast %27 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_18, %cast_17 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_20: f32, %out: f32): | |
%30 = arith.mulf %in, %in_20 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast_19 = tensor.cast %28 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%29 = torch_c.from_builtin_tensor %cast_19 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %29 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After ConvertTorchToSCF (convert-torch-to-scf) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%2 = torch_c.to_f64 %float1.000000e00 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%3 = torch_c.to_f64 %float1.000000e-05 | |
%int3 = torch.constant.int 3 | |
%int2 = torch.constant.int 2 | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%30 = arith.extf %in : f32 to f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cst = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_0 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%8 = tensor.empty(%c2, %c32) : tensor<?x?x1x1xf64> | |
%9 = linalg.fill ins(%cst : f64) outs(%8 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf64>) outs(%9 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast = tensor.cast %10 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%11 = tensor.empty() : tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x1x1xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.sitofp %c163840_i64 : i64 to f64 | |
%31 = arith.divf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %12 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_8: f64, %out: f64): | |
%30 = arith.mulf %in_8, %2 : f64 | |
%31 = arith.subf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13, %13 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_8: f64, %out: f64): | |
%30 = arith.mulf %in, %in_8 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%c2_1 = arith.constant 2 : index | |
%c32_2 = arith.constant 32 : index | |
%15 = tensor.empty(%c2_1, %c32_2) : tensor<?x?x1x1xf64> | |
%16 = linalg.fill ins(%cst : f64) outs(%15 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%14 : tensor<2x32x10x16384xf64>) outs(%16 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_3 = tensor.cast %17 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_3 : tensor<2x32x1x1xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.sitofp %c163840_i64 : i64 to f64 | |
%31 = arith.divf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%19 = tensor.empty() : tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<2x32x1x1xf64>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%30 = arith.truncf %in : f64 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f32 | |
%c2_5 = arith.constant 2 : index | |
%c32_6 = arith.constant 32 : index | |
%21 = tensor.empty(%c2_5, %c32_6) : tensor<?x?x1x1xf32> | |
%22 = linalg.fill ins(%cst_4 : f32) outs(%21 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%22 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.addf %in, %out : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_7 = tensor.cast %23 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_7 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.sitofp %c163840_i64 : i64 to f32 | |
%31 = arith.divf %in, %30 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.truncf %3 : f64 to f32 | |
%31 = arith.sitofp %c1_i64 : i64 to f32 | |
%32 = arith.mulf %30, %31 : f32 | |
%33 = arith.addf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = math.rsqrt %in : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %24 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_8: f32, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
%31 = arith.sitofp %c1_i64 : i64 to f32 | |
%32 = arith.mulf %in_8, %31 : f32 | |
%33 = arith.subf %30, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27, %26 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_8: f32, %out: f32): | |
%30 = arith.mulf %in, %in_8 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%29 = torch_c.from_builtin_tensor %28 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %29 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After ConvertTorchToArith (convert-torch-to-arith) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%cst = arith.constant 1.000000e+00 : f64 | |
%2 = torch_c.from_f64 %cst | |
%3 = torch_c.to_f64 %2 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%cst_0 = arith.constant 1.000000e-05 : f64 | |
%4 = torch_c.from_f64 %cst_0 | |
%5 = torch_c.to_f64 %4 | |
%c3_i64 = arith.constant 3 : i64 | |
%c2_i64 = arith.constant 2 : i64 | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%32 = arith.extf %in : f32 to f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cst_1 = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%10 = tensor.empty(%c2, %c32) : tensor<?x?x1x1xf64> | |
%11 = linalg.fill ins(%cst_1 : f64) outs(%10 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast = tensor.cast %12 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in_10, %3 : f64 | |
%33 = arith.subf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %15 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in, %in_10 : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%c2_3 = arith.constant 2 : index | |
%c32_4 = arith.constant 32 : index | |
%17 = tensor.empty(%c2_3, %c32_4) : tensor<?x?x1x1xf64> | |
%18 = linalg.fill ins(%cst_1 : f64) outs(%17 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%16 : tensor<2x32x10x16384xf64>) outs(%18 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_5 = tensor.cast %19 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_5 : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%21 = tensor.empty() : tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<2x32x1x1xf64>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%32 = arith.truncf %in : f64 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cst_6 = arith.constant 0.000000e+00 : f32 | |
%c2_7 = arith.constant 2 : index | |
%c32_8 = arith.constant 32 : index | |
%23 = tensor.empty(%c2_7, %c32_8) : tensor<?x?x1x1xf32> | |
%24 = linalg.fill ins(%cst_6 : f32) outs(%23 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%24 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.addf %in, %out : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_9 = tensor.cast %25 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_9 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.sitofp %c163840_i64 : i64 to f32 | |
%33 = arith.divf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.truncf %5 : f64 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %32, %33 : f32 | |
%35 = arith.addf %in, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = math.rsqrt %in : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %26 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_10: f32, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %in_10, %33 : f32 | |
%35 = arith.subf %32, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %28 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_10: f32, %out: f32): | |
%32 = arith.mulf %in, %in_10 : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%31 = torch_c.from_builtin_tensor %30 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %31 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After ConvertTorchConversionToMLProgram (convert-torch-conversion-to-mlprogram) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%cst = arith.constant 1.000000e+00 : f64 | |
%2 = torch_c.from_f64 %cst | |
%3 = torch_c.to_f64 %2 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%cst_0 = arith.constant 1.000000e-05 : f64 | |
%4 = torch_c.from_f64 %cst_0 | |
%5 = torch_c.to_f64 %4 | |
%c3_i64 = arith.constant 3 : i64 | |
%c2_i64 = arith.constant 2 : i64 | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%32 = arith.extf %in : f32 to f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cst_1 = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%10 = tensor.empty(%c2, %c32) : tensor<?x?x1x1xf64> | |
%11 = linalg.fill ins(%cst_1 : f64) outs(%10 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast = tensor.cast %12 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in_10, %3 : f64 | |
%33 = arith.subf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %15 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in, %in_10 : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%c2_3 = arith.constant 2 : index | |
%c32_4 = arith.constant 32 : index | |
%17 = tensor.empty(%c2_3, %c32_4) : tensor<?x?x1x1xf64> | |
%18 = linalg.fill ins(%cst_1 : f64) outs(%17 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%19 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%16 : tensor<2x32x10x16384xf64>) outs(%18 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_5 = tensor.cast %19 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_5 : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%21 = tensor.empty() : tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<2x32x1x1xf64>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%32 = arith.truncf %in : f64 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cst_6 = arith.constant 0.000000e+00 : f32 | |
%c2_7 = arith.constant 2 : index | |
%c32_8 = arith.constant 32 : index | |
%23 = tensor.empty(%c2_7, %c32_8) : tensor<?x?x1x1xf32> | |
%24 = linalg.fill ins(%cst_6 : f32) outs(%23 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%24 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.addf %in, %out : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_9 = tensor.cast %25 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_9 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.sitofp %c163840_i64 : i64 to f32 | |
%33 = arith.divf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.truncf %5 : f64 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %32, %33 : f32 | |
%35 = arith.addf %in, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%28 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = math.rsqrt %in : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%29 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %26 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_10: f32, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %in_10, %33 : f32 | |
%35 = arith.subf %32, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%30 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %28 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_10: f32, %out: f32): | |
%32 = arith.mulf %in, %in_10 : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%31 = torch_c.from_builtin_tensor %30 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %31 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
} | |
// -----// IR Dump After ExpandOps (memref-expand) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%cst = arith.constant 1.000000e+00 : f64 | |
%2 = torch_c.from_f64 %cst | |
%3 = torch_c.to_f64 %2 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%cst_0 = arith.constant 1.000000e-05 : f64 | |
%4 = torch_c.from_f64 %cst_0 | |
%5 = torch_c.to_f64 %4 | |
%c3_i64 = arith.constant 3 : i64 | |
%c2_i64 = arith.constant 2 : i64 | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%32 = arith.extf %in : f32 to f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cst_1 = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%10 = tensor.empty(%c2, %c32) : tensor<?x?x1x1xf64> | |
%11 = linalg.fill ins(%cst_1 : f64) outs(%10 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast = tensor.cast %12 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in_10, %3 : f64 | |
%33 = arith.subf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %15 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in, %in_10 : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%c2_3 = arith.constant 2 : index | |
%c32_4 = arith.constant 32 : index | |
%17 = tensor.empty(%c2_3, %c32_4) : tensor<?x?x1x1xf64> | |
%18 = linalg.fill ins(%cst_1 : f64) outs(%17 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%16 : tensor<2x32x10x16384xf64>) outs(%18 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_5 = tensor.cast %19 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_5 : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%21 = tensor.empty() : tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<2x32x1x1xf64>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%32 = arith.truncf %in : f64 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cst_6 = arith.constant 0.000000e+00 : f32 | |
%c2_7 = arith.constant 2 : index | |
%c32_8 = arith.constant 32 : index | |
%23 = tensor.empty(%c2_7, %c32_8) : tensor<?x?x1x1xf32> | |
%24 = linalg.fill ins(%cst_6 : f32) outs(%23 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%24 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.addf %in, %out : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_9 = tensor.cast %25 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_9 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.sitofp %c163840_i64 : i64 to f32 | |
%33 = arith.divf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.truncf %5 : f64 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %32, %33 : f32 | |
%35 = arith.addf %in, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = math.rsqrt %in : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %26 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_10: f32, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %in_10, %33 : f32 | |
%35 = arith.subf %32, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %28 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_10: f32, %out: f32): | |
%32 = arith.mulf %in, %in_10 : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%31 = torch_c.from_builtin_tensor %30 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %31 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%28 = arith.extf %in : f32 to f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.addf %in, %out : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = tensor.empty() : tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%9 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%28 = arith.subf %in, %in_4 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11, %11 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%28 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.fill ins(%cst_3 : f64) outs(%13 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%12 : tensor<2x32x10x16384xf64>) outs(%14 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.addf %in, %out : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf64>) outs(%9 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = tensor.empty() : tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%28 = arith.truncf %in : f64 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = tensor.empty() : tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%19 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.addf %in, %out : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.divf %in, %cst : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.truncf %cst_1 : f64 to f32 | |
%29 = arith.addf %in, %28 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = math.rsqrt %in : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
%29 = arith.subf %28, %in_4 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%28 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = torch_c.from_builtin_tensor %26 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %27 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%28 = arith.extf %in : f32 to f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.addf %in, %out : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = tensor.empty() : tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%9 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%28 = arith.subf %in, %in_4 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11, %11 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%28 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.fill ins(%cst_3 : f64) outs(%13 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%12 : tensor<2x32x10x16384xf64>) outs(%14 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.addf %in, %out : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf64>) outs(%9 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = tensor.empty() : tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%28 = arith.truncf %in : f64 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = tensor.empty() : tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%19 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.addf %in, %out : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.divf %in, %cst : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.truncf %cst_1 : f64 to f32 | |
%29 = arith.addf %in, %28 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = math.rsqrt %in : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
%29 = arith.subf %28, %in_4 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%28 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = torch_c.from_builtin_tensor %26 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %27 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%24 = arith.extf %in : f32 to f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.subf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%24 = arith.truncf %in : f64 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_2 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.truncf %cst_1 : f64 to f32 | |
%25 = arith.addf %in, %24 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%24 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = torch_c.from_builtin_tensor %22 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %23 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%24 = arith.extf %in : f32 to f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.subf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%24 = arith.truncf %in : f64 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_2 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.truncf %cst_1 : f64 to f32 | |
%25 = arith.addf %in, %24 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%24 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = torch_c.from_builtin_tensor %22 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %23 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%24 = arith.extf %in : f32 to f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.subf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%24 = arith.truncf %in : f64 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_2 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.truncf %cst_1 : f64 to f32 | |
%25 = arith.addf %in, %24 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%24 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = torch_c.from_builtin_tensor %22 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %23 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
} | |
// -----// IR Dump After FuncConversionPass (torch-iree-func-conversion) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.from_builtin_tensor %0 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%3 = torch_c.from_builtin_tensor %2 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%30 = arith.extf %in : f32 to f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%10 = tensor.empty() : tensor<2x32x1x1xf64> | |
%11 = linalg.fill ins(%cst_3 : f64) outs(%10 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %13 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%30 = arith.subf %in, %in_4 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%15 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%30 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%15 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%18 = tensor.empty() : tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf64>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%30 = arith.truncf %in : f64 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%18 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.addf %in, %out : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.divf %in, %cst : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.truncf %cst_1 : f64 to f32 | |
%31 = arith.addf %in, %30 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = math.rsqrt %in : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
%31 = arith.subf %30, %in_4 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%30 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = torch_c.from_builtin_tensor %26 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
%28 = hal.tensor.barrier join(%26 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%29 = hal.tensor.export %28 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %29 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.from_builtin_tensor %0 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%3 = torch_c.from_builtin_tensor %2 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%29 = arith.extf %in : f16 to f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%29 = arith.extf %in : f32 to f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%10 = tensor.empty() : tensor<2x32x1x1xf64> | |
%11 = linalg.fill ins(%cst : f64) outs(%10 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.addf %in, %out : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %13 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%29 = arith.subf %in, %in_4 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%29 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%15 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.addf %in, %out : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%18 = tensor.empty() : tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf64>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%29 = arith.truncf %in : f64 to f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_0 : f32) outs(%18 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.addf %in, %out : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.truncf %cst_1 : f64 to f32 | |
%30 = arith.addf %in, %29 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = math.rsqrt %in : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%29 = arith.extf %in : f16 to f32 | |
%30 = arith.subf %29, %in_4 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%29 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = hal.tensor.barrier join(%26 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%28 = hal.tensor.export %27 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %28 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.from_builtin_tensor %0 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%3 = torch_c.from_builtin_tensor %2 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%29 = arith.extf %in : f16 to f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%29 = arith.extf %in : f32 to f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%10 = tensor.empty() : tensor<2x32x1x1xf64> | |
%11 = linalg.fill ins(%cst : f64) outs(%10 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.addf %in, %out : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %13 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%29 = arith.subf %in, %in_4 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%15 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%29 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%15 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.addf %in, %out : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%18 = tensor.empty() : tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf64>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%29 = arith.truncf %in : f64 to f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_0 : f32) outs(%18 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.addf %in, %out : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.truncf %cst_1 : f64 to f32 | |
%30 = arith.addf %in, %29 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = math.rsqrt %in : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%29 = arith.extf %in : f16 to f32 | |
%30 = arith.subf %29, %in_4 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%29 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = hal.tensor.barrier join(%26 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%28 = hal.tensor.export %27 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %28 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
// -----// IR Dump After FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%cst_2 = arith.constant 1.638400e+05 : f32 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x1x1xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.subf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %cst_1 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%cst_2 = arith.constant 1.638400e+05 : f32 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x1x1xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.subf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %cst_1 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%cst_2 = arith.constant 1.638400e+05 : f32 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x1x1xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.subf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %cst_1 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = tensor.empty() : tensor<2x32x1x1xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.subf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = tensor.empty() : tensor<2x32x1x1xf32> | |
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = math.rsqrt %in : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %16 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
%24 = arith.subf %23, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %18 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = hal.tensor.barrier join(%20 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%22 = hal.tensor.export %21 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %22 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = tensor.empty() : tensor<2x32x1x1xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.subf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = tensor.empty() : tensor<2x32x1x1xf32> | |
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = math.rsqrt %in : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %16 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
%24 = arith.subf %23, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %18 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = hal.tensor.barrier join(%20 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%22 = hal.tensor.export %21 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %22 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = tensor.empty() : tensor<2x32x1x1xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.subf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = tensor.empty() : tensor<2x32x1x1xf32> | |
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = math.rsqrt %in : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %16 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
%24 = arith.subf %23, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %18 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = hal.tensor.barrier join(%20 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%22 = hal.tensor.export %21 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %22 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_hip]} { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertConvToChannelsLastPass (iree-preprocessing-convert-conv-to-channels-last) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertConvFilterToChannelsLastPass (iree-preprocessing-convert-conv-filter-to-channels-last) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertConvToChannelsLastPass (iree-preprocessing-convert-conv-to-channels-last) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertConvFilterToChannelsLastPass (iree-preprocessing-convert-conv-filter-to-channels-last) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.subf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = tensor.empty() : tensor<2x32xf32> | |
%12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%12 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%14 = tensor.empty() : tensor<2x32xf32> | |
%15 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<2x32xf32>) outs(%14 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%16 = tensor.empty() : tensor<2x32xf32> | |
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<2x32xf32>) outs(%16 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%18 = tensor.empty() : tensor<2x32xf32> | |
%19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<2x32xf32>) outs(%18 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20, %19 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = hal.tensor.barrier join(%21 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%23 = hal.tensor.export %22 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %23 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.subf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = tensor.empty() : tensor<2x32xf32> | |
%12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%12 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%14 = tensor.empty() : tensor<2x32xf32> | |
%15 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<2x32xf32>) outs(%14 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%16 = tensor.empty() : tensor<2x32xf32> | |
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<2x32xf32>) outs(%16 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%18 = tensor.empty() : tensor<2x32xf32> | |
%19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<2x32xf32>) outs(%18 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20, %19 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = hal.tensor.barrier join(%21 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%23 = hal.tensor.export %22 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %23 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PadToIntrinsicsPass (iree-preprocessing-pad-to-intrinsics) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After AttrBasedPipelinePass (iree-preprocessing-attr-based-pipeline) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After WarnOnUninitializedValuesPass (iree-global-opt-warn-on-uninitialized-values) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After AttrBasedPipelinePass (iree-preprocessing-attr-based-pipeline) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After WarnOnUninitializedValuesPass (iree-global-opt-warn-on-uninitialized-values) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%9 = arith.extf %in : f16 to f32 | |
linalg.yield %9 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_1 : f32 | |
%14 = arith.addf %13, %cst_0 : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%7 = hal.tensor.barrier join(%6 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%8 = hal.tensor.export %7 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%5 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%6 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%13 = arith.extf %in : f16 to f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_0 : f32 | |
%14 = arith.addf %13, %cst : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%5 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%6 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%13 = arith.extf %in : f16 to f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_0 : f32 | |
%14 = arith.addf %13, %cst : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertEncodingToFlowPass (iree-dispatch-creation-convert-encoding-to-flow) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%5 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%6 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%13 = arith.extf %in : f16 to f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_0 : f32 | |
%14 = arith.addf %13, %cst : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertEncodingToFlowPass (iree-dispatch-creation-convert-encoding-to-flow) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%5 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%6 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%13 = arith.extf %in : f16 to f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_0 : f32 | |
%14 = arith.addf %13, %cst : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = ari |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment