Created
April 9, 2025 02:55
-
-
Save pashu123/c4494eeb95c995de448b3960a4cea9fb to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After BindSymbolicShapesPass (torch-iree-bind-symbolic-shapes) //----- //
// var_mean over dims [2,3] of %arg0 (upcast f16->f32), then (%arg1 - mean) * rsqrt(var + 1e-5).
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %int2 = torch.constant.int 2
  %int3 = torch.constant.int 3
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %int0 = torch.constant.int 0
  %true = torch.constant.bool true
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %int1 = torch.constant.int 1
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %int1_0 = torch.constant.int 1
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1_0 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After SetStrictSymbolicShapesPass (torch-iree-set-strict-symbolic-shapes) //----- //
// Unchanged from the previous dump: var_mean over dims [2,3], then (%arg1 - mean) * rsqrt(var + 1e-5).
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %int2 = torch.constant.int 2
  %int3 = torch.constant.int 3
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %int0 = torch.constant.int 0
  %true = torch.constant.bool true
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %int1 = torch.constant.int 1
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %int1_0 = torch.constant.int 1
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1_0 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Canonicalization hoisted/deduplicated constants (%int1_0 folded into %int1); ops unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int0 = torch.constant.int 0
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After BitCastQuantTensorPass (torch-iree-bitcast-quant-tensor) //----- //
// No quantized tensors present; IR unchanged from the canonicalized form.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int0 = torch.constant.int 0
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ReduceOpVariants (torch-reduce-op-variants) //----- //
// No in-place/overload variants to normalize; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int0 = torch.constant.int 0
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertCustomQuantOp (torch-convert-custom-quant-op) //----- //
// No custom quant ops present; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int0 = torch.constant.int 0
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %0 = torch.prims.convert_element_type %arg0, %int6 : !torch.vtensor<[2,32,10,16384],f16>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %result0, %result1 = torch.aten.var_mean.correction %0, %1, %int0, %true : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[2,32,1,1],f32>, !torch.vtensor<[2,32,1,1],f32>
  %2 = torch.aten.add.Scalar %result0, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %3 = torch.aten.rsqrt %2 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %4 = torch.aten.sub.Tensor %arg1, %result1, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %5 = torch.aten.mul.Tensor %4, %3 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %5 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After DecomposeComplexOps (torch-decompose-complex-ops) //----- //
// var_mean.correction is decomposed into sum/div/sub/mul primitives; the variance path is
// computed in f64 (dtype 7) and the mean path in f32 (dtype 6). 163840 = 10 * 16384, the
// number of elements reduced over dims [2,3].
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After FuseQuantizedOps (torch-fuse-quantized-ops) //----- //
// No quantized ops to fuse; IR unchanged from the decomposed form.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Nothing left to fold; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ScalarizeShapes (torch-scalarize-shapes) //----- //
// Shapes are fully static; nothing to scalarize, IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertTorchToTMTensor (convert-torch-to-tmtensor) //----- //
// No ops lower to TMTensor here; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertTMTensorToLinalgExtPass (torch-iree-tm-tensor-to-linalg-ext) //----- //
// No TMTensor ops present; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertTorchToTensor (convert-torch-to-tensor) //----- //
// No torch ops lower to the tensor dialect here; IR unchanged.
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} {
  %float1.000000e00 = torch.constant.float 1.000000e+00
  %int163840 = torch.constant.int 163840
  %int7 = torch.constant.int 7
  %int1 = torch.constant.int 1
  %float1.000000e-05 = torch.constant.float 1.000000e-05
  %true = torch.constant.bool true
  %int3 = torch.constant.int 3
  %int2 = torch.constant.int 2
  %int6 = torch.constant.int 6
  %false = torch.constant.bool false
  %none = torch.constant.none
  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f32>
  %1 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
  %2 = torch.aten.to.dtype %0, %int7, %false, %false, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,10,16384],f64>
  %3 = torch.aten.sum.dim_IntList %2, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %4 = torch.aten.div.Scalar %3, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %5 = torch.aten.sub.Tensor %2, %4, %float1.000000e00 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,1,1],f64>, !torch.float -> !torch.vtensor<[2,32,10,16384],f64>
  %6 = torch.aten.mul.Tensor %5, %5 : !torch.vtensor<[2,32,10,16384],f64>, !torch.vtensor<[2,32,10,16384],f64> -> !torch.vtensor<[2,32,10,16384],f64>
  %7 = torch.aten.sum.dim_IntList %6, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f64>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f64>
  %8 = torch.aten.div.Scalar %7, %int163840 : !torch.vtensor<[2,32,1,1],f64>, !torch.int -> !torch.vtensor<[2,32,1,1],f64>
  %9 = torch.aten.to.dtype %8, %int6, %false, %false, %none : !torch.vtensor<[2,32,1,1],f64>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %10 = torch.aten.sum.dim_IntList %0, %1, %true, %none : !torch.vtensor<[2,32,10,16384],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,32,1,1],f32>
  %11 = torch.aten.div.Scalar %10, %int163840 : !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %12 = torch.aten.add.Scalar %9, %float1.000000e-05, %int1 : !torch.vtensor<[2,32,1,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[2,32,1,1],f32>
  %13 = torch.aten.rsqrt %12 : !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,1,1],f32>
  %14 = torch.aten.sub.Tensor %arg1, %11, %int1 : !torch.vtensor<[2,32,10,16384],f16>, !torch.vtensor<[2,32,1,1],f32>, !torch.int -> !torch.vtensor<[2,32,10,16384],f32>
  %15 = torch.aten.mul.Tensor %14, %13 : !torch.vtensor<[2,32,10,16384],f32>, !torch.vtensor<[2,32,1,1],f32> -> !torch.vtensor<[2,32,10,16384],f32>
  return %15 : !torch.vtensor<[2,32,10,16384],f32>
}
// -----// IR Dump After ConvertTorchToLinalg (convert-torch-to-linalg) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%2 = torch_c.to_f64 %float1.000000e00 | |
%int163840 = torch.constant.int 163840 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%int7 = torch.constant.int 7 | |
%int1 = torch.constant.int 1 | |
%c1_i64 = arith.constant 1 : i64 | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%3 = torch_c.to_f64 %float1.000000e-05 | |
%true = torch.constant.bool true | |
%int3 = torch.constant.int 3 | |
%int2 = torch.constant.int 2 | |
%int6 = torch.constant.int 6 | |
%false = torch.constant.bool false | |
%none = torch.constant.none | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1_0 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_1 = arith.constant 2 : index | |
%c10 = arith.constant 10 : index | |
%c3 = arith.constant 3 : index | |
%c16384 = arith.constant 16384 : index | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%39 = arith.extf %in : f16 to f32 | |
linalg.yield %39 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast = tensor.cast %5 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%6 = torch.prim.ListConstruct %int2, %int3 : (!torch.int, !torch.int) -> !torch.list<int> | |
%c1_2 = arith.constant 1 : index | |
%c0_3 = arith.constant 0 : index | |
%c2_4 = arith.constant 2 : index | |
%c1_5 = arith.constant 1 : index | |
%c32_6 = arith.constant 32 : index | |
%c2_7 = arith.constant 2 : index | |
%c10_8 = arith.constant 10 : index | |
%c3_9 = arith.constant 3 : index | |
%c16384_10 = arith.constant 16384 : index | |
%7 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%39 = arith.extf %in : f32 to f64 | |
linalg.yield %39 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_11 = tensor.cast %8 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%cst = arith.constant 0.000000e+00 : f64 | |
%c1_12 = arith.constant 1 : index | |
%c0_13 = arith.constant 0 : index | |
%dim = tensor.dim %cast_11, %c0_13 : tensor<2x32x10x16384xf64> | |
%c1_14 = arith.constant 1 : index | |
%dim_15 = tensor.dim %cast_11, %c1_14 : tensor<2x32x10x16384xf64> | |
%c2_16 = arith.constant 2 : index | |
%dim_17 = tensor.dim %cast_11, %c2_16 : tensor<2x32x10x16384xf64> | |
%c3_18 = arith.constant 3 : index | |
%dim_19 = tensor.dim %cast_11, %c3_18 : tensor<2x32x10x16384xf64> | |
%9 = tensor.empty(%dim, %dim_15) : tensor<?x?x1x1xf64> | |
%10 = linalg.fill ins(%cst : f64) outs(%9 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_11 : tensor<2x32x10x16384xf64>) outs(%10 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%39 = arith.addf %in, %out : f64 | |
linalg.yield %39 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_20 = tensor.cast %11 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%c1_21 = arith.constant 1 : index | |
%c0_22 = arith.constant 0 : index | |
%c2_23 = arith.constant 2 : index | |
%c1_24 = arith.constant 1 : index | |
%c32_25 = arith.constant 32 : index | |
%c2_26 = arith.constant 2 : index | |
%c1_27 = arith.constant 1 : index | |
%c3_28 = arith.constant 3 : index | |
%c1_29 = arith.constant 1 : index | |
%12 = tensor.empty() : tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_20 : tensor<2x32x1x1xf64>) outs(%12 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%39 = arith.sitofp %c163840_i64 : i64 to f64 | |
%40 = arith.divf %in, %39 : f64 | |
linalg.yield %40 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%cast_30 = tensor.cast %13 : tensor<2x32x1x1xf64> to tensor<2x32x1x1xf64> | |
%c1_31 = arith.constant 1 : index | |
%c0_32 = arith.constant 0 : index | |
%c2_33 = arith.constant 2 : index | |
%c1_34 = arith.constant 1 : index | |
%c32_35 = arith.constant 32 : index | |
%c2_36 = arith.constant 2 : index | |
%c10_37 = arith.constant 10 : index | |
%c3_38 = arith.constant 3 : index | |
%c16384_39 = arith.constant 16384 : index | |
%c0_40 = arith.constant 0 : index | |
%c2_41 = arith.constant 2 : index | |
%c1_42 = arith.constant 1 : index | |
%c32_43 = arith.constant 32 : index | |
%14 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_11, %cast_30 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_163: f64, %out: f64): | |
%39 = arith.mulf %in_163, %2 : f64 | |
%40 = arith.subf %in, %39 : f64 | |
linalg.yield %40 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_44 = tensor.cast %15 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%c1_45 = arith.constant 1 : index | |
%c0_46 = arith.constant 0 : index | |
%c2_47 = arith.constant 2 : index | |
%c1_48 = arith.constant 1 : index | |
%c32_49 = arith.constant 32 : index | |
%c2_50 = arith.constant 2 : index | |
%c10_51 = arith.constant 10 : index | |
%c3_52 = arith.constant 3 : index | |
%c16384_53 = arith.constant 16384 : index | |
%c0_54 = arith.constant 0 : index | |
%c2_55 = arith.constant 2 : index | |
%c1_56 = arith.constant 1 : index | |
%c32_57 = arith.constant 32 : index | |
%c2_58 = arith.constant 2 : index | |
%c10_59 = arith.constant 10 : index | |
%c3_60 = arith.constant 3 : index | |
%c16384_61 = arith.constant 16384 : index | |
%16 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_44, %cast_44 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%16 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_163: f64, %out: f64): | |
%39 = arith.mulf %in, %in_163 : f64 | |
linalg.yield %39 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_62 = tensor.cast %17 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%cst_63 = arith.constant 0.000000e+00 : f64 | |
%c1_64 = arith.constant 1 : index | |
%c0_65 = arith.constant 0 : index | |
%dim_66 = tensor.dim %cast_62, %c0_65 : tensor<2x32x10x16384xf64> | |
%c1_67 = arith.constant 1 : index | |
%dim_68 = tensor.dim %cast_62, %c1_67 : tensor<2x32x10x16384xf64> | |
%c2_69 = arith.constant 2 : index | |
%dim_70 = tensor.dim %cast_62, %c2_69 : tensor<2x32x10x16384xf64> | |
%c3_71 = arith.constant 3 : index | |
%dim_72 = tensor.dim %cast_62, %c3_71 : tensor<2x32x10x16384xf64> | |
%18 = tensor.empty(%dim_66, %dim_68) : tensor<?x?x1x1xf64> | |
%19 = linalg.fill ins(%cst_63 : f64) outs(%18 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_62 : tensor<2x32x10x16384xf64>) outs(%19 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%39 = arith.addf %in, %out : f64 | |
linalg.yield %39 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_73 = tensor.cast %20 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%c1_74 = arith.constant 1 : index | |
%c0_75 = arith.constant 0 : index | |
%c2_76 = arith.constant 2 : index | |
%c1_77 = arith.constant 1 : index | |
%c32_78 = arith.constant 32 : index | |
%c2_79 = arith.constant 2 : index | |
%c1_80 = arith.constant 1 : index | |
%c3_81 = arith.constant 3 : index | |
%c1_82 = arith.constant 1 : index | |
%21 = tensor.empty() : tensor<2x32x1x1xf64> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_73 : tensor<2x32x1x1xf64>) outs(%21 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%39 = arith.sitofp %c163840_i64 : i64 to f64 | |
%40 = arith.divf %in, %39 : f64 | |
linalg.yield %40 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%cast_83 = tensor.cast %22 : tensor<2x32x1x1xf64> to tensor<2x32x1x1xf64> | |
%c1_84 = arith.constant 1 : index | |
%c0_85 = arith.constant 0 : index | |
%c2_86 = arith.constant 2 : index | |
%c1_87 = arith.constant 1 : index | |
%c32_88 = arith.constant 32 : index | |
%c2_89 = arith.constant 2 : index | |
%c1_90 = arith.constant 1 : index | |
%c3_91 = arith.constant 3 : index | |
%c1_92 = arith.constant 1 : index | |
%23 = tensor.empty() : tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_83 : tensor<2x32x1x1xf64>) outs(%23 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%39 = arith.truncf %in : f64 to f32 | |
linalg.yield %39 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_93 = tensor.cast %24 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%cst_94 = arith.constant 0.000000e+00 : f32 | |
%c1_95 = arith.constant 1 : index | |
%c0_96 = arith.constant 0 : index | |
%dim_97 = tensor.dim %cast, %c0_96 : tensor<2x32x10x16384xf32> | |
%c1_98 = arith.constant 1 : index | |
%dim_99 = tensor.dim %cast, %c1_98 : tensor<2x32x10x16384xf32> | |
%c2_100 = arith.constant 2 : index | |
%dim_101 = tensor.dim %cast, %c2_100 : tensor<2x32x10x16384xf32> | |
%c3_102 = arith.constant 3 : index | |
%dim_103 = tensor.dim %cast, %c3_102 : tensor<2x32x10x16384xf32> | |
%25 = tensor.empty(%dim_97, %dim_99) : tensor<?x?x1x1xf32> | |
%26 = linalg.fill ins(%cst_94 : f32) outs(%25 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast : tensor<2x32x10x16384xf32>) outs(%26 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%39 = arith.addf %in, %out : f32 | |
linalg.yield %39 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_104 = tensor.cast %27 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%c1_105 = arith.constant 1 : index | |
%c0_106 = arith.constant 0 : index | |
%c2_107 = arith.constant 2 : index | |
%c1_108 = arith.constant 1 : index | |
%c32_109 = arith.constant 32 : index | |
%c2_110 = arith.constant 2 : index | |
%c1_111 = arith.constant 1 : index | |
%c3_112 = arith.constant 3 : index | |
%c1_113 = arith.constant 1 : index | |
%28 = tensor.empty() : tensor<2x32x1x1xf32> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_104 : tensor<2x32x1x1xf32>) outs(%28 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%39 = arith.sitofp %c163840_i64 : i64 to f32 | |
%40 = arith.divf %in, %39 : f32 | |
linalg.yield %40 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_114 = tensor.cast %29 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%c1_115 = arith.constant 1 : index | |
%c0_116 = arith.constant 0 : index | |
%c2_117 = arith.constant 2 : index | |
%c1_118 = arith.constant 1 : index | |
%c32_119 = arith.constant 32 : index | |
%c2_120 = arith.constant 2 : index | |
%c1_121 = arith.constant 1 : index | |
%c3_122 = arith.constant 3 : index | |
%c1_123 = arith.constant 1 : index | |
%30 = tensor.empty() : tensor<2x32x1x1xf32> | |
%31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_93 : tensor<2x32x1x1xf32>) outs(%30 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%39 = arith.truncf %3 : f64 to f32 | |
%40 = arith.sitofp %c1_i64 : i64 to f32 | |
%41 = arith.mulf %39, %40 : f32 | |
%42 = arith.addf %in, %41 : f32 | |
linalg.yield %42 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_124 = tensor.cast %31 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%c1_125 = arith.constant 1 : index | |
%c0_126 = arith.constant 0 : index | |
%c2_127 = arith.constant 2 : index | |
%c1_128 = arith.constant 1 : index | |
%c32_129 = arith.constant 32 : index | |
%c2_130 = arith.constant 2 : index | |
%c1_131 = arith.constant 1 : index | |
%c3_132 = arith.constant 3 : index | |
%c1_133 = arith.constant 1 : index | |
%32 = tensor.empty() : tensor<2x32x1x1xf32> | |
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_124 : tensor<2x32x1x1xf32>) outs(%32 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%39 = math.rsqrt %in : f32 | |
linalg.yield %39 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_134 = tensor.cast %33 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%c1_135 = arith.constant 1 : index | |
%c0_136 = arith.constant 0 : index | |
%c2_137 = arith.constant 2 : index | |
%c1_138 = arith.constant 1 : index | |
%c32_139 = arith.constant 32 : index | |
%c2_140 = arith.constant 2 : index | |
%c10_141 = arith.constant 10 : index | |
%c3_142 = arith.constant 3 : index | |
%c16384_143 = arith.constant 16384 : index | |
%c0_144 = arith.constant 0 : index | |
%c2_145 = arith.constant 2 : index | |
%c1_146 = arith.constant 1 : index | |
%c32_147 = arith.constant 32 : index | |
%34 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %cast_114 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%34 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_163: f32, %out: f32): | |
%39 = arith.extf %in : f16 to f32 | |
%40 = arith.sitofp %c1_i64 : i64 to f32 | |
%41 = arith.mulf %in_163, %40 : f32 | |
%42 = arith.subf %39, %41 : f32 | |
linalg.yield %42 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast_148 = tensor.cast %35 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%c1_149 = arith.constant 1 : index | |
%c0_150 = arith.constant 0 : index | |
%c2_151 = arith.constant 2 : index | |
%c1_152 = arith.constant 1 : index | |
%c32_153 = arith.constant 32 : index | |
%c2_154 = arith.constant 2 : index | |
%c10_155 = arith.constant 10 : index | |
%c3_156 = arith.constant 3 : index | |
%c16384_157 = arith.constant 16384 : index | |
%c0_158 = arith.constant 0 : index | |
%c2_159 = arith.constant 2 : index | |
%c1_160 = arith.constant 1 : index | |
%c32_161 = arith.constant 32 : index | |
%36 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_148, %cast_134 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%36 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_163: f32, %out: f32): | |
%39 = arith.mulf %in, %in_163 : f32 | |
linalg.yield %39 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast_162 = tensor.cast %37 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%38 = torch_c.from_builtin_tensor %cast_162 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %38 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%2 = torch_c.to_f64 %float1.000000e00 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%3 = torch_c.to_f64 %float1.000000e-05 | |
%int3 = torch.constant.int 3 | |
%int2 = torch.constant.int 2 | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast = tensor.cast %5 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%30 = arith.extf %in : f32 to f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_0 = tensor.cast %7 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%cst = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%dim = tensor.dim %cast_0, %c0 : tensor<2x32x10x16384xf64> | |
%c1 = arith.constant 1 : index | |
%dim_1 = tensor.dim %cast_0, %c1 : tensor<2x32x10x16384xf64> | |
%c2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%8 = tensor.empty(%dim, %dim_1) : tensor<?x?x1x1xf64> | |
%9 = linalg.fill ins(%cst : f64) outs(%8 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_0 : tensor<2x32x10x16384xf64>) outs(%9 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_2 = tensor.cast %10 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%11 = tensor.empty() : tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_2 : tensor<2x32x1x1xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.sitofp %c163840_i64 : i64 to f64 | |
%31 = arith.divf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%cast_3 = tensor.cast %12 : tensor<2x32x1x1xf64> to tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_0, %cast_3 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_20: f64, %out: f64): | |
%30 = arith.mulf %in_20, %2 : f64 | |
%31 = arith.subf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_4 = tensor.cast %13 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_4, %cast_4 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_20: f64, %out: f64): | |
%30 = arith.mulf %in, %in_20 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cast_5 = tensor.cast %14 : tensor<2x32x10x16384xf64> to tensor<2x32x10x16384xf64> | |
%dim_6 = tensor.dim %cast_5, %c0 : tensor<2x32x10x16384xf64> | |
%dim_7 = tensor.dim %cast_5, %c1 : tensor<2x32x10x16384xf64> | |
%15 = tensor.empty(%dim_6, %dim_7) : tensor<?x?x1x1xf64> | |
%16 = linalg.fill ins(%cst : f64) outs(%15 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_5 : tensor<2x32x10x16384xf64>) outs(%16 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_8 = tensor.cast %17 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_8 : tensor<2x32x1x1xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.sitofp %c163840_i64 : i64 to f64 | |
%31 = arith.divf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%cast_9 = tensor.cast %18 : tensor<2x32x1x1xf64> to tensor<2x32x1x1xf64> | |
%19 = tensor.empty() : tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_9 : tensor<2x32x1x1xf64>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%30 = arith.truncf %in : f64 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_10 = tensor.cast %20 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%cst_11 = arith.constant 0.000000e+00 : f32 | |
%dim_12 = tensor.dim %cast, %c0 : tensor<2x32x10x16384xf32> | |
%dim_13 = tensor.dim %cast, %c1 : tensor<2x32x10x16384xf32> | |
%21 = tensor.empty(%dim_12, %dim_13) : tensor<?x?x1x1xf32> | |
%22 = linalg.fill ins(%cst_11 : f32) outs(%21 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast : tensor<2x32x10x16384xf32>) outs(%22 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.addf %in, %out : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_14 = tensor.cast %23 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_14 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.sitofp %c163840_i64 : i64 to f32 | |
%31 = arith.divf %in, %30 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_15 = tensor.cast %24 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_10 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.truncf %3 : f64 to f32 | |
%31 = arith.sitofp %c1_i64 : i64 to f32 | |
%32 = arith.mulf %30, %31 : f32 | |
%33 = arith.addf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_16 = tensor.cast %25 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_16 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = math.rsqrt %in : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cast_17 = tensor.cast %26 : tensor<2x32x1x1xf32> to tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %cast_15 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_20: f32, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
%31 = arith.sitofp %c1_i64 : i64 to f32 | |
%32 = arith.mulf %in_20, %31 : f32 | |
%33 = arith.subf %30, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast_18 = tensor.cast %27 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_18, %cast_17 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_20: f32, %out: f32): | |
%30 = arith.mulf %in, %in_20 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%cast_19 = tensor.cast %28 : tensor<2x32x10x16384xf32> to tensor<2x32x10x16384xf32> | |
%29 = torch_c.from_builtin_tensor %cast_19 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %29 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After ConvertTorchToSCF (convert-torch-to-scf) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%2 = torch_c.to_f64 %float1.000000e00 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%3 = torch_c.to_f64 %float1.000000e-05 | |
%int3 = torch.constant.int 3 | |
%int2 = torch.constant.int 2 | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%30 = arith.extf %in : f32 to f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cst = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_0 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%8 = tensor.empty(%c2, %c32) : tensor<?x?x1x1xf64> | |
%9 = linalg.fill ins(%cst : f64) outs(%8 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf64>) outs(%9 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast = tensor.cast %10 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%11 = tensor.empty() : tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x1x1xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.sitofp %c163840_i64 : i64 to f64 | |
%31 = arith.divf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %12 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_8: f64, %out: f64): | |
%30 = arith.mulf %in_8, %2 : f64 | |
%31 = arith.subf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13, %13 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%6 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_8: f64, %out: f64): | |
%30 = arith.mulf %in, %in_8 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%c2_1 = arith.constant 2 : index | |
%c32_2 = arith.constant 32 : index | |
%15 = tensor.empty(%c2_1, %c32_2) : tensor<?x?x1x1xf64> | |
%16 = linalg.fill ins(%cst : f64) outs(%15 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%14 : tensor<2x32x10x16384xf64>) outs(%16 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_3 = tensor.cast %17 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_3 : tensor<2x32x1x1xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.sitofp %c163840_i64 : i64 to f64 | |
%31 = arith.divf %in, %30 : f64 | |
linalg.yield %31 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%19 = tensor.empty() : tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<2x32x1x1xf64>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%30 = arith.truncf %in : f64 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cst_4 = arith.constant 0.000000e+00 : f32 | |
%c2_5 = arith.constant 2 : index | |
%c32_6 = arith.constant 32 : index | |
%21 = tensor.empty(%c2_5, %c32_6) : tensor<?x?x1x1xf32> | |
%22 = linalg.fill ins(%cst_4 : f32) outs(%21 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%22 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.addf %in, %out : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_7 = tensor.cast %23 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_7 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.sitofp %c163840_i64 : i64 to f32 | |
%31 = arith.divf %in, %30 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.truncf %3 : f64 to f32 | |
%31 = arith.sitofp %c1_i64 : i64 to f32 | |
%32 = arith.mulf %30, %31 : f32 | |
%33 = arith.addf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25 : tensor<2x32x1x1xf32>) outs(%19 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = math.rsqrt %in : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %24 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_8: f32, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
%31 = arith.sitofp %c1_i64 : i64 to f32 | |
%32 = arith.mulf %in_8, %31 : f32 | |
%33 = arith.subf %30, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27, %26 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_8: f32, %out: f32): | |
%30 = arith.mulf %in, %in_8 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%29 = torch_c.from_builtin_tensor %28 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %29 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After ConvertTorchToArith (convert-torch-to-arith) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%cst = arith.constant 1.000000e+00 : f64 | |
%2 = torch_c.from_f64 %cst | |
%3 = torch_c.to_f64 %2 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%cst_0 = arith.constant 1.000000e-05 : f64 | |
%4 = torch_c.from_f64 %cst_0 | |
%5 = torch_c.to_f64 %4 | |
%c3_i64 = arith.constant 3 : i64 | |
%c2_i64 = arith.constant 2 : i64 | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%32 = arith.extf %in : f32 to f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cst_1 = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%10 = tensor.empty(%c2, %c32) : tensor<?x?x1x1xf64> | |
%11 = linalg.fill ins(%cst_1 : f64) outs(%10 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast = tensor.cast %12 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in_10, %3 : f64 | |
%33 = arith.subf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %15 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in, %in_10 : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%c2_3 = arith.constant 2 : index | |
%c32_4 = arith.constant 32 : index | |
%17 = tensor.empty(%c2_3, %c32_4) : tensor<?x?x1x1xf64> | |
%18 = linalg.fill ins(%cst_1 : f64) outs(%17 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%16 : tensor<2x32x10x16384xf64>) outs(%18 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_5 = tensor.cast %19 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_5 : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%21 = tensor.empty() : tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<2x32x1x1xf64>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%32 = arith.truncf %in : f64 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cst_6 = arith.constant 0.000000e+00 : f32 | |
%c2_7 = arith.constant 2 : index | |
%c32_8 = arith.constant 32 : index | |
%23 = tensor.empty(%c2_7, %c32_8) : tensor<?x?x1x1xf32> | |
%24 = linalg.fill ins(%cst_6 : f32) outs(%23 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%24 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.addf %in, %out : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_9 = tensor.cast %25 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_9 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.sitofp %c163840_i64 : i64 to f32 | |
%33 = arith.divf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.truncf %5 : f64 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %32, %33 : f32 | |
%35 = arith.addf %in, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = math.rsqrt %in : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %26 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_10: f32, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %in_10, %33 : f32 | |
%35 = arith.subf %32, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %28 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_10: f32, %out: f32): | |
%32 = arith.mulf %in, %in_10 : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%31 = torch_c.from_builtin_tensor %30 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %31 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After ConvertTorchConversionToMLProgram (convert-torch-conversion-to-mlprogram) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%cst = arith.constant 1.000000e+00 : f64 | |
%2 = torch_c.from_f64 %cst | |
%3 = torch_c.to_f64 %2 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%cst_0 = arith.constant 1.000000e-05 : f64 | |
%4 = torch_c.from_f64 %cst_0 | |
%5 = torch_c.to_f64 %4 | |
%c3_i64 = arith.constant 3 : i64 | |
%c2_i64 = arith.constant 2 : i64 | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%32 = arith.extf %in : f32 to f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cst_1 = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%10 = tensor.empty(%c2, %c32) : tensor<?x?x1x1xf64> | |
%11 = linalg.fill ins(%cst_1 : f64) outs(%10 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast = tensor.cast %12 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in_10, %3 : f64 | |
%33 = arith.subf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %15 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in, %in_10 : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%c2_3 = arith.constant 2 : index | |
%c32_4 = arith.constant 32 : index | |
%17 = tensor.empty(%c2_3, %c32_4) : tensor<?x?x1x1xf64> | |
%18 = linalg.fill ins(%cst_1 : f64) outs(%17 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%19 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%16 : tensor<2x32x10x16384xf64>) outs(%18 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_5 = tensor.cast %19 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_5 : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%21 = tensor.empty() : tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<2x32x1x1xf64>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%32 = arith.truncf %in : f64 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cst_6 = arith.constant 0.000000e+00 : f32 | |
%c2_7 = arith.constant 2 : index | |
%c32_8 = arith.constant 32 : index | |
%23 = tensor.empty(%c2_7, %c32_8) : tensor<?x?x1x1xf32> | |
%24 = linalg.fill ins(%cst_6 : f32) outs(%23 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%24 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.addf %in, %out : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_9 = tensor.cast %25 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_9 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.sitofp %c163840_i64 : i64 to f32 | |
%33 = arith.divf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.truncf %5 : f64 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %32, %33 : f32 | |
%35 = arith.addf %in, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%28 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = math.rsqrt %in : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%29 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %26 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_10: f32, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %in_10, %33 : f32 | |
%35 = arith.subf %32, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%30 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %28 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_10: f32, %out: f32): | |
%32 = arith.mulf %in, %in_10 : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%31 = torch_c.from_builtin_tensor %30 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %31 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
} | |
// -----// IR Dump After ExpandOps (memref-expand) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%cst = arith.constant 1.000000e+00 : f64 | |
%2 = torch_c.from_f64 %cst | |
%3 = torch_c.to_f64 %2 | |
%c163840_i64 = arith.constant 163840 : i64 | |
%c1_i64 = arith.constant 1 : i64 | |
%cst_0 = arith.constant 1.000000e-05 : f64 | |
%4 = torch_c.from_f64 %cst_0 | |
%5 = torch_c.to_f64 %4 | |
%c3_i64 = arith.constant 3 : i64 | |
%c2_i64 = arith.constant 2 : i64 | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%32 = arith.extf %in : f32 to f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%cst_1 = arith.constant 0.000000e+00 : f64 | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c2_2 = arith.constant 2 : index | |
%c3 = arith.constant 3 : index | |
%10 = tensor.empty(%c2, %c32) : tensor<?x?x1x1xf64> | |
%11 = linalg.fill ins(%cst_1 : f64) outs(%10 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast = tensor.cast %12 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in_10, %3 : f64 | |
%33 = arith.subf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15, %15 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_10: f64, %out: f64): | |
%32 = arith.mulf %in, %in_10 : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%c2_3 = arith.constant 2 : index | |
%c32_4 = arith.constant 32 : index | |
%17 = tensor.empty(%c2_3, %c32_4) : tensor<?x?x1x1xf64> | |
%18 = linalg.fill ins(%cst_1 : f64) outs(%17 : tensor<?x?x1x1xf64>) -> tensor<?x?x1x1xf64> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%16 : tensor<2x32x10x16384xf64>) outs(%18 : tensor<?x?x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.addf %in, %out : f64 | |
linalg.yield %32 : f64 | |
} -> tensor<?x?x1x1xf64> | |
%cast_5 = tensor.cast %19 : tensor<?x?x1x1xf64> to tensor<2x32x1x1xf64> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_5 : tensor<2x32x1x1xf64>) outs(%13 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%32 = arith.sitofp %c163840_i64 : i64 to f64 | |
%33 = arith.divf %in, %32 : f64 | |
linalg.yield %33 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%21 = tensor.empty() : tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<2x32x1x1xf64>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%32 = arith.truncf %in : f64 to f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%cst_6 = arith.constant 0.000000e+00 : f32 | |
%c2_7 = arith.constant 2 : index | |
%c32_8 = arith.constant 32 : index | |
%23 = tensor.empty(%c2_7, %c32_8) : tensor<?x?x1x1xf32> | |
%24 = linalg.fill ins(%cst_6 : f32) outs(%23 : tensor<?x?x1x1xf32>) -> tensor<?x?x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%24 : tensor<?x?x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.addf %in, %out : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<?x?x1x1xf32> | |
%cast_9 = tensor.cast %25 : tensor<?x?x1x1xf32> to tensor<2x32x1x1xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cast_9 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.sitofp %c163840_i64 : i64 to f32 | |
%33 = arith.divf %in, %32 : f32 | |
linalg.yield %33 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = arith.truncf %5 : f64 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %32, %33 : f32 | |
%35 = arith.addf %in, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27 : tensor<2x32x1x1xf32>) outs(%21 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%32 = math.rsqrt %in : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %26 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_10: f32, %out: f32): | |
%32 = arith.extf %in : f16 to f32 | |
%33 = arith.sitofp %c1_i64 : i64 to f32 | |
%34 = arith.mulf %in_10, %33 : f32 | |
%35 = arith.subf %32, %34 : f32 | |
linalg.yield %35 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %28 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_10: f32, %out: f32): | |
%32 = arith.mulf %in, %in_10 : f32 | |
linalg.yield %32 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%31 = torch_c.from_builtin_tensor %30 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %31 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%28 = arith.extf %in : f32 to f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.addf %in, %out : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = tensor.empty() : tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%9 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%28 = arith.subf %in, %in_4 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11, %11 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%28 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.fill ins(%cst_3 : f64) outs(%13 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%12 : tensor<2x32x10x16384xf64>) outs(%14 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.addf %in, %out : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf64>) outs(%9 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = tensor.empty() : tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%28 = arith.truncf %in : f64 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = tensor.empty() : tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%19 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.addf %in, %out : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.divf %in, %cst : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.truncf %cst_1 : f64 to f32 | |
%29 = arith.addf %in, %28 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = math.rsqrt %in : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
%29 = arith.subf %28, %in_4 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%28 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = torch_c.from_builtin_tensor %26 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %27 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%28 = arith.extf %in : f32 to f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.addf %in, %out : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = tensor.empty() : tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%9 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%28 = arith.subf %in, %in_4 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11, %11 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%28 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%13 = tensor.empty() : tensor<2x32x1x1xf64> | |
%14 = linalg.fill ins(%cst_3 : f64) outs(%13 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%12 : tensor<2x32x10x16384xf64>) outs(%14 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.addf %in, %out : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf64>) outs(%9 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%28 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %28 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = tensor.empty() : tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%28 = arith.truncf %in : f64 to f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = tensor.empty() : tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%19 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.addf %in, %out : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.divf %in, %cst : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = arith.truncf %cst_1 : f64 to f32 | |
%29 = arith.addf %in, %28 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%17 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%28 = math.rsqrt %in : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%28 = arith.extf %in : f16 to f32 | |
%29 = arith.subf %28, %in_4 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%28 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %28 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = torch_c.from_builtin_tensor %26 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %27 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%24 = arith.extf %in : f32 to f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.subf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%24 = arith.truncf %in : f64 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_2 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.truncf %cst_1 : f64 to f32 | |
%25 = arith.addf %in, %24 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%24 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = torch_c.from_builtin_tensor %22 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %23 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%24 = arith.extf %in : f32 to f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.subf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%24 = arith.truncf %in : f64 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_2 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.truncf %cst_1 : f64 to f32 | |
%25 = arith.addf %in, %24 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%24 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = torch_c.from_builtin_tensor %22 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %23 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
func.func @main(%arg0: !torch.vtensor<[2,32,10,16384],f16>, %arg1: !torch.vtensor<[2,32,10,16384],f16>) -> !torch.vtensor<[2,32,10,16384],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%0 = torch_c.to_builtin_tensor %arg1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%24 = arith.extf %in : f32 to f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst_3 : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.subf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%24 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.addf %in, %out : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%24 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %24 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%24 = arith.truncf %in : f64 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_2 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.truncf %cst_1 : f64 to f32 | |
%25 = arith.addf %in, %24 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%24 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = torch_c.from_builtin_tensor %22 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
return %23 : !torch.vtensor<[2,32,10,16384],f32> | |
} | |
} | |
// -----// IR Dump After FuncConversionPass (torch-iree-func-conversion) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.from_builtin_tensor %0 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%3 = torch_c.from_builtin_tensor %2 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%cst = arith.constant 1.638400e+05 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f64 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 0.000000e+00 : f32 | |
%cst_3 = arith.constant 0.000000e+00 : f64 | |
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%30 = arith.extf %in : f32 to f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%10 = tensor.empty() : tensor<2x32x1x1xf64> | |
%11 = linalg.fill ins(%cst_3 : f64) outs(%10 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %13 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%30 = arith.subf %in, %in_4 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%15 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%30 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%15 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.addf %in, %out : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%30 = arith.divf %in, %cst_0 : f64 | |
linalg.yield %30 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%18 = tensor.empty() : tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf64>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%30 = arith.truncf %in : f64 to f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_2 : f32) outs(%18 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.addf %in, %out : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.divf %in, %cst : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = arith.truncf %cst_1 : f64 to f32 | |
%31 = arith.addf %in, %30 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%30 = math.rsqrt %in : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%30 = arith.extf %in : f16 to f32 | |
%31 = arith.subf %30, %in_4 : f32 | |
linalg.yield %31 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%30 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = torch_c.from_builtin_tensor %26 : tensor<2x32x10x16384xf32> -> !torch.vtensor<[2,32,10,16384],f32> | |
%28 = hal.tensor.barrier join(%26 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%29 = hal.tensor.export %28 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %29 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%0 = util.null : !hal.fence | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.from_builtin_tensor %0 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%3 = torch_c.from_builtin_tensor %2 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%29 = arith.extf %in : f16 to f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%29 = arith.extf %in : f32 to f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%10 = tensor.empty() : tensor<2x32x1x1xf64> | |
%11 = linalg.fill ins(%cst : f64) outs(%10 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.addf %in, %out : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %13 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%29 = arith.subf %in, %in_4 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%29 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%15 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.addf %in, %out : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%18 = tensor.empty() : tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf64>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%29 = arith.truncf %in : f64 to f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_0 : f32) outs(%18 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.addf %in, %out : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.truncf %cst_1 : f64 to f32 | |
%30 = arith.addf %in, %29 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = math.rsqrt %in : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%29 = arith.extf %in : f16 to f32 | |
%30 = arith.subf %29, %in_4 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%29 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = hal.tensor.barrier join(%26 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%28 = hal.tensor.export %27 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %28 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = torch_c.from_builtin_tensor %0 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%2 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%3 = torch_c.from_builtin_tensor %2 : tensor<2x32x10x16384xf16> -> !torch.vtensor<[2,32,10,16384],f16> | |
%4 = torch_c.to_builtin_tensor %3 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%5 = torch_c.to_builtin_tensor %1 : !torch.vtensor<[2,32,10,16384],f16> -> tensor<2x32x10x16384xf16> | |
%6 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%29 = arith.extf %in : f16 to f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%29 = arith.extf %in : f32 to f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%10 = tensor.empty() : tensor<2x32x1x1xf64> | |
%11 = linalg.fill ins(%cst : f64) outs(%10 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.addf %in, %out : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %13 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%29 = arith.subf %in, %in_4 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%15 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %14 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%8 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%29 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%16 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%15 : tensor<2x32x10x16384xf64>) outs(%11 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.addf %in, %out : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%17 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<2x32x1x1xf64>) outs(%10 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%29 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %29 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%18 = tensor.empty() : tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf64>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%29 = arith.truncf %in : f64 to f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.fill ins(%cst_0 : f32) outs(%18 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%20 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.addf %in, %out : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%23 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = arith.truncf %cst_1 : f64 to f32 | |
%30 = arith.addf %in, %29 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%24 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<2x32x1x1xf32>) outs(%18 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%29 = math.rsqrt %in : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%25 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %22 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%29 = arith.extf %in : f16 to f32 | |
%30 = arith.subf %29, %in_4 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%26 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %24 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%29 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %29 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%27 = hal.tensor.barrier join(%26 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%28 = hal.tensor.export %27 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %28 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
// -----// IR Dump After FinalizingBackendTypeConversion (torch-finalizing-backend-type-conversion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f64 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 1.000000e-05 : f64 | |
%cst_2 = arith.constant 1.638400e+05 : f64 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf64> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f32, %out: f64): | |
%25 = arith.extf %in : f32 to f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%6 = tensor.empty() : tensor<2x32x1x1xf64> | |
%7 = linalg.fill ins(%cst : f64) outs(%6 : tensor<2x32x1x1xf64>) -> tensor<2x32x1x1xf64> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf64>, tensor<2x32x1x1xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.subf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf64>, tensor<2x32x10x16384xf64>) outs(%4 : tensor<2x32x10x16384xf64>) { | |
^bb0(%in: f64, %in_4: f64, %out: f64): | |
%25 = arith.mulf %in, %in_4 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x10x16384xf64> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf64>) outs(%7 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.addf %in, %out : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf64>) outs(%6 : tensor<2x32x1x1xf64>) { | |
^bb0(%in: f64, %out: f64): | |
%25 = arith.divf %in, %cst_2 : f64 | |
linalg.yield %25 : f64 | |
} -> tensor<2x32x1x1xf64> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf64>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f64, %out: f32): | |
%25 = arith.truncf %in : f64 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.truncf %cst_1 : f64 to f32 | |
%26 = arith.addf %in, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%cst_2 = arith.constant 1.638400e+05 : f32 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x1x1xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.subf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %cst_1 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%cst_2 = arith.constant 1.638400e+05 : f32 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x1x1xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.subf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %cst_1 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%cst_2 = arith.constant 1.638400e+05 : f32 | |
%cst_3 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%6 = tensor.empty() : tensor<2x32x1x1xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%5 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.subf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %10 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%11 : tensor<2x32x10x16384xf32>) outs(%7 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = tensor.empty() : tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%16 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %out : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.divf %in, %cst_3 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = arith.addf %in, %cst_1 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<2x32x1x1xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%25 = math.rsqrt %in : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %18 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_4: f32, %out: f32): | |
%25 = arith.extf %in : f16 to f32 | |
%26 = arith.subf %25, %in_4 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %20 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_4: f32, %out: f32): | |
%25 = arith.mulf %in, %in_4 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%23 = hal.tensor.barrier join(%22 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%24 = hal.tensor.export %23 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = tensor.empty() : tensor<2x32x1x1xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.subf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = tensor.empty() : tensor<2x32x1x1xf32> | |
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = math.rsqrt %in : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %16 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
%24 = arith.subf %23, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %18 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = hal.tensor.barrier join(%20 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%22 = hal.tensor.export %21 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %22 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After Inliner (inline) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = tensor.empty() : tensor<2x32x1x1xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.subf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = tensor.empty() : tensor<2x32x1x1xf32> | |
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = math.rsqrt %in : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %16 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
%24 = arith.subf %23, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %18 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = hal.tensor.barrier join(%20 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%22 = hal.tensor.export %21 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %22 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%5 = tensor.empty() : tensor<2x32x1x1xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.subf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%4 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%6 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = tensor.empty() : tensor<2x32x1x1xf32> | |
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%14 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %out : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%15 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17 : tensor<2x32x1x1xf32>) outs(%13 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%23 = math.rsqrt %in : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %16 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%23 = arith.extf %in : f16 to f32 | |
%24 = arith.subf %23, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%19, %18 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%23 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = hal.tensor.barrier join(%20 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%22 = hal.tensor.export %21 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %22 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SymbolDCE (symbol-dce) //----- // | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
module { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_hip]} { | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertConvToChannelsLastPass (iree-preprocessing-convert-conv-to-channels-last) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertConvFilterToChannelsLastPass (iree-preprocessing-convert-conv-filter-to-channels-last) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32x1x1xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32x1x1xf32>) -> tensor<2x32x1x1xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x1x1xf32>) outs(%4 : tensor<2x32x1x1xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x1x1xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32x1x1xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertConvToChannelsLastPass (iree-preprocessing-convert-conv-to-channels-last) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertConvFilterToChannelsLastPass (iree-preprocessing-convert-conv-filter-to-channels-last) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.subf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = tensor.empty() : tensor<2x32xf32> | |
%12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%12 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%14 = tensor.empty() : tensor<2x32xf32> | |
%15 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<2x32xf32>) outs(%14 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%16 = tensor.empty() : tensor<2x32xf32> | |
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<2x32xf32>) outs(%16 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%18 = tensor.empty() : tensor<2x32xf32> | |
%19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<2x32xf32>) outs(%18 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20, %19 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = hal.tensor.barrier join(%21 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%23 = hal.tensor.export %22 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %23 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%7 = tensor.empty() : tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%7 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %8 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.subf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %9 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%11 = tensor.empty() : tensor<2x32xf32> | |
%12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%10 : tensor<2x32x10x16384xf32>) outs(%12 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %out : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%14 = tensor.empty() : tensor<2x32xf32> | |
%15 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%13 : tensor<2x32xf32>) outs(%14 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%16 = tensor.empty() : tensor<2x32xf32> | |
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<2x32xf32>) outs(%16 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%18 = tensor.empty() : tensor<2x32xf32> | |
%19 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<2x32xf32>) outs(%18 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%24 = math.rsqrt %in : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32xf32> | |
%20 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%24 = arith.extf %in : f16 to f32 | |
%25 = arith.subf %24, %in_2 : f32 | |
linalg.yield %25 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%21 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20, %19 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%24 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%22 = hal.tensor.barrier join(%21 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%23 = hal.tensor.export %22 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %23 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PadToIntrinsicsPass (iree-preprocessing-pad-to-intrinsics) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After AttrBasedPipelinePass (iree-preprocessing-attr-based-pipeline) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After WarnOnUninitializedValuesPass (iree-global-opt-warn-on-uninitialized-values) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %8 : tensor<2x32x10x16384xf32>, tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgMatMulPass (iree-preprocessing-generalize-linalg-matmul-experimental) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After AttrBasedPipelinePass (iree-preprocessing-attr-based-pipeline) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After WarnOnUninitializedValuesPass (iree-global-opt-warn-on-uninitialized-values) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateLinalgTransposePass (iree-global-opt-propagate-linalg-transpose) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPOPass (iree-util-ipo) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.subf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8 : tensor<2x32x10x16384xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.mulf %in, %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %out : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = arith.addf %in, %cst_0 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%18 = math.rsqrt %in : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %out: f32): | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %13 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%18 = arith.mulf %in, %in_2 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%16 = hal.tensor.barrier join(%15 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%17 = hal.tensor.export %16 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %17 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%12 = arith.extf %in : f16 to f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.addf %in, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%12 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%12 = arith.subf %in, %in_2 : f32 | |
%13 = arith.mulf %12, %12 : f32 | |
%14 = arith.addf %13, %out : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%12 = arith.divf %in_3, %cst_1 : f32 | |
%13 = arith.addf %12, %cst_0 : f32 | |
%14 = math.rsqrt %13 : f32 | |
%15 = arith.extf %in : f16 to f32 | |
%16 = arith.subf %15, %in_2 : f32 | |
%17 = arith.mulf %16, %14 : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = hal.tensor.barrier join(%9 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%11 = hal.tensor.export %10 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %11 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 9.99999974E-6 : f32 | |
%cst_1 = arith.constant 1.638400e+05 : f32 | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%9 = arith.extf %in : f16 to f32 | |
linalg.yield %9 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%4 = tensor.empty() : tensor<2x32xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%4 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_1 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_1 : f32 | |
%14 = arith.addf %13, %cst_0 : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%7 = hal.tensor.barrier join(%6 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%8 = hal.tensor.export %7 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %8 : !hal.buffer_view | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%5 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%6 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%13 = arith.extf %in : f16 to f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_0 : f32 | |
%14 = arith.addf %13, %cst : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%5 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%6 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%13 = arith.extf %in : f16 to f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_0 : f32 | |
%14 = arith.addf %13, %cst : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertEncodingToFlowPass (iree-dispatch-creation-convert-encoding-to-flow) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%5 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%6 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%13 = arith.extf %in : f16 to f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_0 : f32 | |
%14 = arith.addf %13, %cst : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertEncodingToFlowPass (iree-dispatch-creation-convert-encoding-to-flow) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.region -> (tensor<2x32x10x16384xf32>) { | |
%5 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%6 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%13 = arith.extf %in : f16 to f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%8 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7 : tensor<2x32x10x16384xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.addf %in, %out : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%6 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%13 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%7, %10 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%13 = arith.subf %in, %in_2 : f32 | |
%14 = arith.mulf %13, %13 : f32 | |
%15 = arith.addf %14, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %10, %11 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%13 = arith.divf %in_3, %cst_0 : f32 | |
%14 = arith.addf %13, %cst : f32 | |
%15 = math.rsqrt %14 : f32 | |
%16 = arith.extf %in : f16 to f32 | |
%17 = arith.subf %16, %in_2 : f32 | |
%18 = arith.mulf %17, %15 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.return %12 : tensor<2x32x10x16384xf32> | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%cst = arith.constant 9.99999974E-6 : f32 | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst_1 : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch.workgroups(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> = | |
(%arg4: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg5: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg6: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%5 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%6 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%7 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%8 = tensor.empty() : tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%5 : tensor<2x32x10x16384xf16>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%15 = arith.extf %in : f16 to f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%11 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9 : tensor<2x32x10x16384xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.addf %in, %out : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%15 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32xf32> | |
%13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%9, %12 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%10 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%15 = arith.subf %in, %in_2 : f32 | |
%16 = arith.mulf %15, %15 : f32 | |
%17 = arith.addf %16, %out : f32 | |
linalg.yield %17 : f32 | |
} -> tensor<2x32xf32> | |
%14 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6, %12, %13 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%7 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%15 = arith.divf %in_3, %cst_0 : f32 | |
%16 = arith.addf %15, %cst_1 : f32 | |
%17 = math.rsqrt %16 : f32 | |
%18 = arith.extf %in : f16 to f32 | |
%19 = arith.subf %18, %in_2 : f32 | |
%20 = arith.mulf %19, %17 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %14, %arg6, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
flow.return | |
} count() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After StripDebugOpsPass (iree-util-strip-debug-ops) //----- // | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = arith.subf %13, %in_2 : f32 | |
%15 = arith.mulf %14, %12 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : tensor<2x32x10x16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>> | |
return | |
} | |
} | |
} | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} { | |
%0 = hal.tensor.import wait(%arg2) => %arg0 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%1 = hal.tensor.import wait(%arg2) => %arg1 : !hal.buffer_view -> tensor<2x32x10x16384xf16> | |
%2 = flow.dispatch @main$async_dispatch_0::@main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%0, %1) : (tensor<2x32x10x16384xf16>, tensor<2x32x10x16384xf16>) -> tensor<2x32x10x16384xf32> | |
%3 = hal.tensor.barrier join(%2 : tensor<2x32x10x16384xf32>) => %arg3 : !hal.fence | |
%4 = hal.tensor.export %3 : tensor<2x32x10x16384xf32> -> !hal.buffer_view | |
util.return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After OptimizeIntArithmeticPass (iree-util-optimize-int-arithmetic) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatternsPass (iree-util-apply-patterns) //----- // | |
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = util.null : !hal.fence | |
%c-1_i32 = arith.constant -1 : i32 | |
%c0 = arith.constant 0 : index | |
%device_0 = hal.devices.get %c0 : !hal.device | |
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence | |
%1 = util.call @main$async(%arg0, %arg1, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view | |
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) flags("None") : i32 | |
util.return %1 : !hal.buffer_view | |
} | |
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> | |
#map2 = affine_map<(d0, d1) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @main$async_dispatch_0 { | |
flow.executable.export public @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32 workgroups() -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main$async_dispatch_0_elementwise_2x32x10x16384_f16xf32xf32xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2x32x10x16384xf32>>) { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%cst_0 = arith.constant 1.638400e+05 : f32 | |
%cst_1 = arith.constant 9.99999974E-6 : f32 | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 16384], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x16384xf16>> -> tensor<2x32x10x16384xf16> | |
%2 = tensor.empty() : tensor<2x32x10x16384xf32> | |
%3 = tensor.empty() : tensor<2x32xf32> | |
%4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<2x32x10x16384xf16>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %out: f32): | |
%10 = arith.extf %in : f16 to f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32x10x16384xf32> | |
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x32xf32>) -> tensor<2x32xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<2x32x10x16384xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.addf %in, %out : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<2x32xf32>) outs(%3 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%10 = arith.divf %in, %cst_0 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<2x32xf32> | |
%8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4, %7 : tensor<2x32x10x16384xf32>, tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) { | |
^bb0(%in: f32, %in_2: f32, %out: f32): | |
%10 = arith.subf %in, %in_2 : f32 | |
%11 = arith.mulf %10, %10 : f32 | |
%12 = arith.addf %11, %out : f32 | |
linalg.yield %12 : f32 | |
} -> tensor<2x32xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %7, %8 : tensor<2x32x10x16384xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32x10x16384xf32>) { | |
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32): | |
%10 = arith.divf %in_3, %cst_0 : f32 | |
%11 = arith.addf %10, %cst_1 : f32 | |
%12 = math.rsqrt %11 : f32 | |
%13 = arith.extf %in : f16 to f32 | |
%14 = ari |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment